# Load the finalized aquatic-organisms dataset and stamp each row with a
# stable identifier ("ID1", "ID2", ...) used to track observations downstream.
aoc <- read_csv("AquaticOrganisms_Clean_final.csv", guess_max = 10000) %>%
mutate(ID = sprintf("ID%d", row_number()))
#### Introduction Setup ####
# All text inputs below.
#### Overview AO Setup ####
#Final_effect_dataset <- read_csv("Final_effect_dataset.csv")%>%
#mutate(plot_f = case_when(
#plot_f == "Polymer" ~ "Polymer",
#plot_f == "Size" ~ "Size",
#plot_f == "Shape" ~ "Shape",
#plot_f == "Organism" ~ "Organism",
#plot_f == "Lvl1" ~ "Endpoint Category",
#plot_f == "Life.stage" ~ "Life Stage",
#plot_f == "Invivo.invivo" ~ "In Vivo or In Vitro",
#plot_f == "Exposure.route" ~ "Exposure Route"))%>%
#mutate(plot_f = factor(plot_f))%>%
#mutate(logEndpoints = log(Endpoints))%>%
#rename(Percent = Freq)
# Percent of observations with ("Y") / without ("N") an effect, by polymer.
polydf <- rowPerc(xtabs(~polymer + effect, aoc)) # row percentages of polymer x effect
polyf <- as.data.frame(polydf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only (drops rowPerc totals)
  mutate(polymer = case_when(
    polymer == "BIO" ~ "Biopolymer",
    polymer == "EVA" ~ "Polyethylene Vinyl Acetate",
    polymer == "LTX" ~ "Latex",
    polymer == "PA" ~ "Polyamide",
    polymer == "PE" ~ "Polyethylene",
    polymer == "PC" ~ "Polycarbonate",
    polymer == "PET" ~ "Polyethylene Terephthalate",
    polymer == "PI" ~ "Polyisoprene",
    polymer == "PMMA" ~ "Polymethylmethacrylate",
    polymer == "PP" ~ "Polypropylene",
    polymer == "PS" ~ "Polystyrene",
    polymer == "PUR" ~ "Polyurethane", # fixed spelling (was "Polyurathane")
    polymer == "PVC" ~ "Polyvinylchloride",
    polymer == "PLA" ~ "Polylactic Acid")) %>%
  mutate_if(is.numeric, round, 0) # integer percents
# Raw endpoint counts for the same polymer x effect cross-tabulation.
Endpoints <- xtabs(~polymer + effect, aoc)
polyfinal <- data.frame(cbind(polyf, Endpoints)) %>% # bind counts next to percents
  rename(Endpoints = "Freq.1") %>% # count column contributed by the xtabs table
  mutate(logEndpoints = log(Endpoints)) %>% # log scale for plotting
  rename(Percent = "Freq") # percent column contributed by rowPerc
# Percent of observations with/without an effect, by particle-size bin.
sizedf <- rowPerc(xtabs(~size.category + effect, aoc))
sizef <- as.data.frame(sizedf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  mutate(size.category = case_when(
    size.category == 0 ~ "unavailable",
    size.category == 1 ~ "<1µm",
    size.category == 2 ~ "1µm < 10µm",
    size.category == 3 ~ "10µm < 100µm",
    size.category == 4 ~ "100µm < 1mm",
    size.category == 5 ~ "1mm < 5mm")) %>%
  rename(Type = "size.category") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Size") # tag used to combine these summaries later
# Raw endpoint counts for the same size x effect cross-tabulation.
study_s <- xtabs(~size.category + effect, aoc)
sizefinal <- data.frame(cbind(sizef, study_s)) %>%
  rename(Endpoints = "Freq.1",
         category = "size.category") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, by particle shape.
shapedf <- rowPerc(xtabs(~shape + effect, aoc))
shapef <- as.data.frame(shapedf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "shape") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Shape", # tag used to combine these summaries later
         Type = case_when(
           Type == "cube" ~ "Cube",
           Type == "sphere" ~ "Sphere",
           Type == "fragment" ~ "Fragment",
           Type == "fiber" ~ "Fiber"))
# Raw endpoint counts for the same shape x effect cross-tabulation.
study_sh <- xtabs(~shape + effect, aoc)
shapefinal <- data.frame(cbind(shapef, study_sh)) %>%
  rename(Endpoints = "Freq.1",
         category = "shape") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, by organism group.
taxdf <- rowPerc(xtabs(~organism.group + effect, aoc))
taxf <- as.data.frame(taxdf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "organism.group") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Organism") # tag used to combine these summaries later
# Raw endpoint counts for the same organism x effect cross-tabulation.
study_t <- xtabs(~organism.group + effect, aoc)
taxfinal <- data.frame(cbind(taxf, study_t)) %>%
  rename(Endpoints = "Freq.1",
         category = "organism.group") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, by endpoint category (lvl1).
lvl1df <- rowPerc(xtabs(~lvl1 + effect, aoc))
lvl1f <- as.data.frame(lvl1df) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "lvl1") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Lvl1", # tag used to combine these summaries later
         Type = case_when(
           Type == "alimentary.excretory" ~ "Alimentary, Excretory",
           Type == "behavioral.sense.neuro" ~ "Behavioral, Sensory, Neurological",
           Type == "circulatory.respiratory" ~ "Circulatory, Respiratory",
           Type == "community" ~ "Community",
           Type == "fitness" ~ "Fitness",
           Type == "immune" ~ "Immune",
           Type == "metabolism" ~ "Metabolism",
           Type == "microbiome" ~ "Microbiome",
           Type == "stress" ~ "Stress"))
# Raw endpoint counts for the same lvl1 x effect cross-tabulation.
study_l <- xtabs(~lvl1 + effect, aoc)
lvl1final <- data.frame(cbind(lvl1f, study_l)) %>%
  rename(Endpoints = "Freq.1",
         category = "lvl1") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, by life stage.
lifedf <- rowPerc(xtabs(~life.stage + effect, aoc))
lifef <- as.data.frame(lifedf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "life.stage") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Life.stage") # tag used to combine these summaries later
# Raw endpoint counts for the same life-stage x effect cross-tabulation.
studyli <- xtabs(~life.stage + effect, aoc)
lifefinal <- data.frame(cbind(lifef, studyli)) %>%
  rename(Endpoints = "Freq.1",
         category = "life.stage") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, in vivo vs in vitro.
vivodf <- rowPerc(xtabs(~invitro.invivo + effect, aoc))
vivof <- as.data.frame(vivodf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "invitro.invivo") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Invivo.invivo", # tag used to combine these summaries later
         Type = case_when(
           Type == "invivo" ~ "In Vivo",
           Type == "invitro" ~ "In Vitro"))
# Raw endpoint counts for the same in vivo/in vitro x effect cross-tabulation.
study_v <- xtabs(~invitro.invivo + effect, aoc)
vivofinal <- data.frame(cbind(vivof, study_v)) %>%
  rename(Endpoints = "Freq.1",
         category = "invitro.invivo") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
# Percent of observations with/without an effect, by exposure route.
routedf <- rowPerc(xtabs(~exposure.route + effect, aoc))
routef <- as.data.frame(routedf) %>%
  filter(effect %in% c("Y", "N")) %>% # keep Yes/No rows only
  rename(Type = "exposure.route") %>%
  mutate_if(is.numeric, round, 0) %>% # integer percents
  mutate(plot = "Exposure.route", # tag used to combine these summaries later
         Type = case_when(
           Type == "coparental.exposure" ~ "Co-Parental Exposure",
           Type == "paternal.exposure" ~ "Paternal Exposure",
           Type == "maternal.exposure" ~ "Maternal Exposure",
           Type == "food" ~ "Food",
           Type == "water" ~ "Water",
           Type == "sediment" ~ "Sediment",
           Type == "media" ~ "Media"))
# Raw endpoint counts for the same route x effect cross-tabulation.
study_r <- xtabs(~exposure.route + effect, aoc)
routefinal <- data.frame(cbind(routef, study_r)) %>%
  rename(Endpoints = "Freq.1",
         category = "exposure.route") %>%
  mutate(logEndpoints = log(Endpoints)) %>%
  rename(Percent = "Freq")
#### Exploration AO Setup ####
# Master dataset for scatterplots - for Heili's tab.
# Base copy of the dataset used by the scatterplot tab:
# adds an ordered Yes/No effect factor and fills in NA descriptors.
aoc_v1 <- aoc %>%
  mutate(effect_f = factor(
    case_when(effect == "Y" ~ "Yes",
              effect == "N" ~ "No"),
    levels = c("No", "Yes"))) %>%
  # Replace missing descriptors so downstream factors have explicit levels.
  replace_na(list(size.category = 0,
                  shape = "Not Reported",
                  polymer = "Not Reported",
                  life.stage = "Not Reported"))
# Master analysis dataset: adds display-friendly factor versions of the key
# descriptor columns and a handful of derived dose/effect variables.
aoc_setup <- aoc_v1 %>% # start with original dataset
  # Size bins, ordered smallest to largest.
  # NOTE(review): these labels differ from the Overview section above
  # (e.g. "<1µm", "1µm < 10µm") — confirm which labelling is intended.
  mutate(size_f = factor(case_when(
    size.category == 1 ~ "1nm < 100nm",
    size.category == 2 ~ "100nm < 1µm",
    size.category == 3 ~ "1µm < 100µm",
    size.category == 4 ~ "100µm < 1mm",
    size.category == 5 ~ "1mm < 5mm",
    size.category == 0 ~ "Not Reported"),
    levels = c("1nm < 100nm", "100nm < 1µm", "1µm < 100µm", "100µm < 1mm", "1mm < 5mm", "Not Reported"))) %>%
  # shape category data tidying.
  mutate(shape_f = factor(case_when(
    shape == "fiber" ~ "Fiber",
    shape == "fragment" ~ "Fragment",
    shape == "sphere" ~ "Sphere",
    shape == "Not Reported" ~ "Not Reported"),
    levels = c("Fiber", "Fragment", "Sphere", "Not Reported"))) %>%
  # polymer category data tidying (abbreviation -> full name).
  mutate(poly_f = factor(case_when(
    polymer == "BIO" ~ "Biopolymer",
    polymer == "EVA" ~ "Polyethylene Vinyl Acetate",
    polymer == "LTX" ~ "Latex",
    polymer == "PA" ~ "Polyamide",
    polymer == "PE" ~ "Polyethylene",
    polymer == "PC" ~ "Polycarbonate",
    polymer == "PET" ~ "Polyethylene Terephthalate",
    polymer == "PI" ~ "Polyisoprene",
    polymer == "PMMA" ~ "Polymethylmethacrylate",
    polymer == "PP" ~ "Polypropylene",
    polymer == "PS" ~ "Polystyrene",
    polymer == "PUR" ~ "Polyurethane", # fixed spelling (was "Polyurathane")
    polymer == "PVC" ~ "Polyvinylchloride",
    polymer == "PLA" ~ "Polylactic Acid",
    polymer == "Not Reported" ~ "Not Reported"))) %>%
  # taxonomic category data tidying; explicit level order for plots.
  mutate(org_f = factor(organism.group, levels = c("Algae", "Annelida", "Bacterium", "Cnidaria", "Crustacea",
                                                   "Echinoderm", "Fish", "Insect", "Mollusca", "Nematoda", "Plant", "Rotifera", "Mixed"))) %>%
  # endpoint category (level 1) with display names.
  mutate(lvl1_f = factor(case_when(lvl1 == "alimentary.excretory" ~ "Alimentary, Excretory",
                                   lvl1 == "behavioral.sense.neuro" ~ "Behavioral, Sensory, Neurological",
                                   lvl1 == "circulatory.respiratory" ~ "Circulatory, Respiratory",
                                   lvl1 == "community" ~ "Community",
                                   lvl1 == "fitness" ~ "Fitness",
                                   lvl1 == "immune" ~ "Immune",
                                   lvl1 == "metabolism" ~ "Metabolism",
                                   lvl1 == "microbiome" ~ "Microbiome",
                                   lvl1 == "stress" ~ "Stress"))) %>%
  # endpoint subcategory (level 2) with display names.
  # Fixed label typos: Aggressivity, Bacteroidetes, Proteobacteria,
  # "Immune Other" trailing space, and Kidney capitalization.
  mutate(lvl2_f = factor(case_when(lvl2 == "abundance" ~ "Abundance",
                                   lvl2 == "actinobacteria" ~ "Actinobacteria",
                                   lvl2 == "aggressivity" ~ "Aggressivity",
                                   lvl2 == "ammonia.excretion" ~ "Ammonia Excretion",
                                   lvl2 == "bacteroidetes" ~ "Bacteroidetes",
                                   lvl2 == "blood" ~ "Blood",
                                   lvl2 == "body.condition" ~ "Body Condition",
                                   lvl2 == "boldness" ~ "Boldness",
                                   lvl2 == "brain.histo" ~ "Brain Histological Abnormalities",
                                   lvl2 == "burrowing" ~ "Burrowing",
                                   lvl2 == "carb.metabolism" ~ "Carb Metabolism",
                                   lvl2 == "chemokines.cytokines" ~ "Chemokines",
                                   lvl2 == "circulatory" ~ "Circulatory",
                                   lvl2 == "detoxification" ~ "Detoxification",
                                   lvl2 == "development" ~ "Development",
                                   lvl2 == "digestion" ~ "Digestion",
                                   lvl2 == "digestive.enzymes" ~ "Digestive Enzymes",
                                   lvl2 == "digestive.tract.histo" ~ "Digestive Tract Histological Abnormalities",
                                   lvl2 == "diversity" ~ "Diversity",
                                   lvl2 == "feeding" ~ "Feeding",
                                   lvl2 == "firmicutes" ~ "Firmicutes",
                                   lvl2 == "gall.bladder.histo" ~ "Gall Bladder Histological Abnormalities",
                                   lvl2 == "gen.metabolism" ~ "General Metabolism",
                                   lvl2 == "gill.histo" ~ "Gill Histological Abnormalities",
                                   lvl2 == "gonad.histo" ~ "Gonad Histological Abnormalities",
                                   lvl2 == "growth" ~ "Growth",
                                   lvl2 == "immune.cells" ~ "Immune Cells",
                                   lvl2 == "immune.other" ~ "Immune Other",
                                   lvl2 == "intestinal.permeability" ~ "Intestinal Permeability",
                                   lvl2 == "kidney.histo" ~ "Kidney Histological Abnormalities",
                                   lvl2 == "lipid.metabolism" ~ "Lipid Metabolism",
                                   lvl2 == "liver.histo" ~ "Liver Histological Abnormalities",
                                   lvl2 == "liver.kidney.products" ~ "Liver and Kidney Products",
                                   lvl2 == "locomotion" ~ "Locomotion",
                                   lvl2 == "mortality" ~ "Mortality",
                                   lvl2 == "nervous.system" ~ "Nervous System",
                                   lvl2 == "oxidative.stress" ~ "Oxidative Stress",
                                   lvl2 == "photosynthesis" ~ "Photosynthesis",
                                   lvl2 == "proteobacteria" ~ "Proteobacteria",
                                   lvl2 == "reproduction" ~ "Reproduction",
                                   lvl2 == "respiration" ~ "Respiration",
                                   lvl2 == "sexhormones" ~ "Sex Hormones",
                                   lvl2 == "shoaling" ~ "Shoaling",
                                   lvl2 == "stress" ~ "Stress",
                                   lvl2 == "vision.system" ~ "Vision System"))) %>% # Renames for widget
  # biological level of organization.
  mutate(bio_f = factor(case_when(bio.org == "cell" ~ "Cell",
                                  bio.org == "organism" ~ "Organism",
                                  bio.org == "population" ~ "Population",
                                  bio.org == "subcell" ~ "Subcell",
                                  bio.org == "tissue" ~ "Tissue"))) %>%
  # Not widget-driven right now; kept for the human-health database.
  mutate(vivo_f = factor(case_when(invitro.invivo == "invivo" ~ "In Vivo",
                                   invitro.invivo == "invitro" ~ "In Vitro"))) %>%
  mutate(life_f = factor(case_when(life.stage == "Early" ~ "Early",
                                   life.stage == "Juvenile" ~ "Juvenile",
                                   life.stage == "Adult" ~ "Adult",
                                   life.stage == "Not Reported" ~ "Not Reported"))) %>% # Renames for widget
  mutate(env_f = factor(case_when(environment == "Freshwater" ~ "Freshwater",
                                  environment == "Marine" ~ "Marine",
                                  environment == "Terrestrial" ~ "Terrestrial"))) %>%
  # Reported-dose flags kept as factors for the UI.
  mutate(dose.mg.L.master.converted.reported = factor(dose.mg.L.master.converted.reported)) %>%
  mutate(dose.particles.mL.master.converted.reported = factor(dose.particles.mL.master.converted.reported)) %>%
  mutate(dose.um3.mL.master = particle.volume.um3 * dose.particles.mL.master) %>% # calculate volume/mL
  # assumes af.time is character — TODO confirm (replace_na() errors when
  # replacing NAs in a numeric column with a character value in recent tidyr).
  mutate(af.time_noNA = replace_na(af.time, "Unavailable")) %>%
  mutate(acute.chronic_f = factor(case_when(af.time_noNA == 10 ~ "Acute",
                                            af.time_noNA == 1 ~ "Chronic",
                                            af.time_noNA == "Unavailable" ~ "Unavailable"))) %>% # factorize assessment factor time into chronic/acute
  mutate(dose.mg.L.master.AF.noec = dose.mg.L.master * af.noec) %>%
  mutate(dose.particles.mL.master.AF.noec = dose.particles.mL.master * af.noec) %>%
  # NOTE: this intentionally overwrites the Yes/No effect_f created in aoc_v1
  # with raw Y/N levels; effect_10 below depends on the "Y"/"N" labels.
  mutate(effect_f = factor(effect)) %>%
  mutate_if(is.character, as.factor) %>%
  mutate(effect_10 = case_when(
    effect_f == "Y" ~ 1,
    effect_f == "N" ~ 0))
#### SSD AO Setup ####
# Master dataset for SSDs
# SSD master dataset: explicit environment levels plus the Species/Group
# columns (capitalized names are required by the SSD code downstream).
aoc_z <- aoc_setup %>%
  mutate(environment.noNA = replace_na(environment, "Not Reported")) %>% # replace NA to better relabel
  mutate(env_f = factor(environment.noNA,
                        levels = c("Marine", "Freshwater", "Terrestrial", "Not Reported")))
# Binomial species name; must be called "Species" (uppercase).
aoc_z$Species <- as.factor(paste(aoc_z$genus, aoc_z$species))
# Taxonomic group; must be called "Group". fct_explicit_na() keeps species
# countable even when their group is missing.
aoc_z$Group <- fct_explicit_na(as.factor(aoc_z$organism.group))
# Subset data to the variables used by the multivariate analyses.
# (The original select list repeated size_f, organism.group and
# max.size.ingest.mm, and applied the same filter twice; the duplicates were
# harmless no-ops and have been removed.)
multiVar <- aoc_z %>% dplyr::select(#doi, size.category,
  size_f,
  size.length.um.used.for.conversions,
  shape,
  polymer,
  particle.volume.um3,
  density.mg.um.3,
  organism.group, #factor
  environment,
  bio.org, #biological level of organization
  #af.time, #assessment factor based on exposure time
  treatments, #number of doses (not including control)
  effect, #yes no
  effect_f,
  effect_10,
  exposure.duration.d,
  exposure.route, #Factor
  media.temp, #numeric
  lvl1_f, #endpoints
  lvl2_f, #endpoints
  # lvl3,
  dose.mg.L.master,
  sex, #factor
  media.ph, #numeric
  media.sal.ppt, #numeric
  dose.particles.mL.master,
  effect.metric, #NOEC LOEC
  functional.group, #factor
  charge, #positive or negative
  zetapotential.mV, # numeric
  max.size.ingest.mm, #maximum ingestible size
  acute.chronic_f,
  dose.mg.L.master.AF.noec,
  dose.particles.mL.master.AF.noec,
  effect.score) %>% #1 = minor, 2 = photosynthesis, feeding, 3 = growth, chlorophyll content, 4 = reproduction, 5 = population growth, 6 = survival
  filter(size_f != "Not Reported") #take out rows without a reported size bin
#recode variables
# multiVar <- multiVar %>% mutate(effect_10 = case_when(
# effect == "Y" ~ 1,
# effect == "N" ~ 0
# ))# %>%
# #mutate_all(is.character, ~as.factor())
# Data completeness per column, by size bin: fraction missing per column,
# rounded, then converted to "percent complete" (100 * (1 - missing)).
# Modernized: summarise_all -> summarise(across(everything(), ...)) and
# across(is.numeric, ...) -> across(where(is.numeric), ...) (the bare
# predicate form is deprecated in current dplyr).
CompletenessSummary_size <- multiVar %>%
  group_by(size_f) %>%
  summarise(across(everything(), ~ sum(is.na(.)) / length(.))) %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  mutate(across(where(is.numeric), ~ 100 * (1 - .)))
CompletenessSummary_size
## # A tibble: 5 x 32
## size_f size.length.um.~ shape polymer particle.volume~ density.mg.um.3
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1nm <~ 100 100 100 98 100
## 2 100nm~ 100 100 100 99 100
## 3 1µm <~ 100 100 100 98 94
## 4 100µm~ 100 100 100 95 97
## 5 1mm <~ 100 100 100 100 89
## # ... with 26 more variables: organism.group <dbl>, environment <dbl>,
## # bio.org <dbl>, treatments <dbl>, effect <dbl>, effect_f <dbl>,
## # effect_10 <dbl>, exposure.duration.d <dbl>, exposure.route <dbl>,
## # media.temp <dbl>, lvl1_f <dbl>, lvl2_f <dbl>, dose.mg.L.master <dbl>,
## # sex <dbl>, media.ph <dbl>, media.sal.ppt <dbl>,
## # dose.particles.mL.master <dbl>, effect.metric <dbl>,
## # functional.group <dbl>, charge <dbl>, zetapotential.mV <dbl>,
## # max.size.ingest.mm <dbl>, acute.chronic_f <dbl>,
## # dose.mg.L.master.AF.noec <dbl>, dose.particles.mL.master.AF.noec <dbl>,
## # effect.score <dbl>
require(pheatmap)
#convert to matrix and transpose
# NOTE(review): arrange('1nm < 100nm') sorts by a constant character literal,
# so it does not reorder anything (referencing the column would need
# backticks, not quotes). It still passes the frame through dplyr — confirm
# the row names that pheatmap uses as row labels survive this step.
transposed_size <- as.data.frame(t(as.matrix(CompletenessSummary_size[2:25]))) %>% #1 is category, 2-6 are variables
arrange('1nm < 100nm')
#reassign column names (one per size bin, in bin order)
colnames(transposed_size) <- c("1nm < 100nm", "100nm < 1µm", "1µm < 100µm", "100µm < 1mm", "1mm < 5mm")
#transposedTag
#format as matrix
MissingMatrix_size <- data.matrix(transposed_size)
#build heatmap of percent-complete values (rows = variables, cols = size bins)
# brewer.pal() requires RColorBrewer to be loaded elsewhere — TODO confirm
pheatmap(MissingMatrix_size,
main = "Data Completeness by Size Bin", #title
fontsize = 12,
cluster_rows = FALSE, cluster_cols = FALSE,#disable dendrograms
display_numbers = TRUE,
treeheight_row = 0, treeheight_col = 0, #keeps clustering after dropping dendrograms
col = rev(brewer.pal(n = 9, name = "PuBu"))) #blue color scheme with 9 colors)
### Completeness by Environment
# Data completeness per column, by environment (percent complete).
# Fixed: the original filter compared against the string "NA"
# (`!environment == "NA"`); rows with a genuinely missing environment were
# only dropped because NA comparisons are themselves NA. Use is.na() to state
# the intent directly. Also modernized summarise_all/across as above.
CompletenessSummary_environment <- multiVar %>%
  filter(!is.na(environment)) %>%
  group_by(environment) %>%
  summarise(across(everything(), ~ sum(is.na(.)) / length(.))) %>%
  mutate(across(where(is.numeric), ~ round(., 2))) %>%
  mutate(across(where(is.numeric), ~ 100 * (1 - .)))
CompletenessSummary_environment
## # A tibble: 3 x 32
## environment size_f size.length.um.~ shape polymer particle.volume~
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Freshwater 100 100 100 100 99
## 2 Marine 100 100 100 100 95
## 3 Terrestrial 100 100 100 100 100
## # ... with 26 more variables: density.mg.um.3 <dbl>, organism.group <dbl>,
## # bio.org <dbl>, treatments <dbl>, effect <dbl>, effect_f <dbl>,
## # effect_10 <dbl>, exposure.duration.d <dbl>, exposure.route <dbl>,
## # media.temp <dbl>, lvl1_f <dbl>, lvl2_f <dbl>, dose.mg.L.master <dbl>,
## # sex <dbl>, media.ph <dbl>, media.sal.ppt <dbl>,
## # dose.particles.mL.master <dbl>, effect.metric <dbl>,
## # functional.group <dbl>, charge <dbl>, zetapotential.mV <dbl>,
## # max.size.ingest.mm <dbl>, acute.chronic_f <dbl>,
## # dose.mg.L.master.AF.noec <dbl>, dose.particles.mL.master.AF.noec <dbl>,
## # effect.score <dbl>
require(pheatmap)
# Transpose so variables become rows and environments become columns
# (column 1 holds the grouping variable, hence the 2:25 slice).
env_completeness_t <- t(as.matrix(CompletenessSummary_environment[2:25]))
transposed_environment <- as.data.frame(env_completeness_t)
# Label the columns with their environment names.
colnames(transposed_environment) <- c("Freshwater", "Marine", "Terrestrial")
# Coerce to a numeric matrix for pheatmap.
MissingMatrix_environment <- data.matrix(transposed_environment)
# Static heatmap of percent-complete values; no clustering so rows keep
# their data order.
pheatmap(MissingMatrix_environment,
main = "Data Completeness by environment", #title
fontsize = 12,
cluster_rows = FALSE, cluster_cols = FALSE,#disable dendrograms
display_numbers = TRUE,
treeheight_row = 0, treeheight_col = 0, #keeps clustering after dropping dendrograms
col = rev(brewer.pal(n = 9, name = "PuBu"))) #blue color scheme with 9 colors)
## Estimate an OLS Regression
# Linear probability model: regress the 0/1 effect indicator on particle and
# exposure covariates; rows with missing predictors are dropped (na.omit).
fitols <- lm(
  effect_10 ~ size.length.um.used.for.conversions + particle.volume.um3 +
    exposure.duration.d + media.temp + dose.particles.mL.master,
  data = multiVar,
  na.action = na.omit
)
summary(fitols)
##
## Call:
## lm(formula = effect_10 ~ size.length.um.used.for.conversions +
## particle.volume.um3 + exposure.duration.d + media.temp +
## dose.particles.mL.master, data = multiVar, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5545 -0.3385 -0.3091 0.6448 0.8356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.104e-01 3.675e-02 5.727 1.10e-08 ***
## size.length.um.used.for.conversions -1.251e-04 2.923e-05 -4.280 1.91e-05 ***
## particle.volume.um3 4.504e-11 9.431e-12 4.776 1.85e-06 ***
## exposure.duration.d 9.134e-04 3.120e-04 2.927 0.00344 **
## media.temp 5.048e-03 1.691e-03 2.986 0.00284 **
## dose.particles.mL.master 4.778e-20 1.957e-20 2.441 0.01469 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4682 on 3951 degrees of freedom
## (1968 observations deleted due to missingness)
## Multiple R-squared: 0.01223, Adjusted R-squared: 0.01098
## F-statistic: 9.787 on 5 and 3951 DF, p-value: 2.612e-09
require(reshape2)
# Density of each continuous predictor, log10-transformed when strictly
# positive. Fixed: the original predicate `~is.numeric(.) && (.) > 0` fed a
# whole vector to `&&`, which silently used only the first element and is an
# error in R >= 4.3; the predicate below returns a single TRUE/FALSE.
multiVar %>%
  dplyr::select(size.length.um.used.for.conversions, particle.volume.um3, exposure.duration.d, media.temp, dose.particles.mL.master) %>%
  melt() %>% # wide -> long (columns: variable, value)
  mutate_if(~ is.numeric(.) && isTRUE(all(. > 0, na.rm = TRUE)), log10) %>%
  ggplot(aes(x = value)) +
  stat_density() +
  facet_wrap(~variable, scales = "free")
require(uwot)
require(Rtsne)
require(vizier) #devtools::install_github("jlmelville/vizier")
# For some functions we need to strip out non-numeric columns and convert data to matrix
# Coerce X to a numeric matrix: data frames keep only their numeric columns;
# anything already a matrix passes through unchanged.
x2m <- function(X) {
  if (!methods::is(X, "matrix")) {
    # drop = FALSE keeps the matrix shape (and column names) even when only
    # one numeric column survives the selection; without it a single column
    # collapses to a bare vector and loses its name.
    m <- as.matrix(X[, which(vapply(X, is.numeric, logical(1))), drop = FALSE])
  } else {
    m <- X
  }
  m
}
# Choose the variables with the highest completeness; aquatic records only.
multiVar2 <- multiVar %>%
  filter(!environment == "Terrestrial") %>%
  dplyr::select(size_f, size.length.um.used.for.conversions, shape, polymer, particle.volume.um3, density.mg.um.3, organism.group, bio.org, treatments, effect, exposure.duration.d, exposure.route, lvl1_f, dose.mg.L.master) %>%
  # Log-transform strictly positive numeric columns. Fixed: the original
  # predicate passed a vector to `&&` (error in R >= 4.3).
  mutate_if(~ is.numeric(.) && isTRUE(all(. > 0, na.rm = TRUE)), log10) %>%
  drop_na() # drop rows with any missing value
# Encode factors/discrete columns as integer codes so the frame is all-numeric.
multiVar2[] <- data.matrix(multiVar2)
# build umap for small dataset (<10,000 points)
# pca = 10 reduces to 10 principal components before the neighbour search;
# requires the fully numeric multiVar2 produced by data.matrix() above.
multiVar_map <- umap(multiVar2, pca = 10)
# Remove duplicates for t-SNE
#multiVar2_noNa_dup <- multiVar2_noNa[-which(duplicated(x2m(multiVar2_noNa))), ]
#build t-SNE
#multiVar_tsne <- Rtsne::Rtsne(multiVar2_noNa_dup, perplexity = 15, initial_dims = 100, partical_pca = TRUE, exaggeration_factor = 4)
# Non-numeric columns are ignored, so in a lot of cases you can pass a data
# frame directly to umap
#iris_umap <- umap(iris, n_neighbors = 50, learning_rate = 0.5, init = "random")
#visualize umap
# Thin wrapper around vizier::embed_plot(): X supplies the data used for
# colouring, Y the 2-D embedding coordinates; extra arguments pass through.
# `k` is accepted for call compatibility but is not forwarded.
embed_img <- function(X, Y, k = 15, ...) {
  vizier::embed_plot(x = X, coords = Y, ...)
}
# Plot the UMAP embedding of the toxicity dataset.
embed_img(multiVar2, multiVar_map, pc_axes = TRUE, equal_axes = TRUE, alpha_scale = 0.5, title = "Tox UMAP", cex = 1)
#PCA
# First two principal components of the numeric matrix; column 5
# (particle.volume.um3) is excluded — presumably because of its extreme
# numeric range — TODO confirm.
pca <- stats::prcomp(multiVar2[,-5], retx = TRUE, rank. = 2)
#build color pallete
# NOTE(review): my_colors is built but unused below — the plot uses the
# Okabe-Ito palette; kept as an alternative colour scheme.
my_colors = colorRampPalette(c("red", "yellow", "green"))(nrow(multiVar2))
#plot PCA scores coloured by polymer code
embed_plot(pca$x, multiVar2$polymer, color_scheme = palette.colors(palette = "Okabe-Ito"), #turbo, #rainbow, #my_colors,
title = "Polymer PCA", alpha_scale = 0.5, equal_axes = TRUE)
require(plotly)
# Interactive PCA coloured by the discrete polymer variable.
# Fixed: select() is now namespace-qualified (consistent with the rest of the
# file, and safe against masking by other attached packages), and the
# `&&`-on-vector predicate (error in R >= 4.3) now returns a scalar.
multiVar_discrete <- multiVar %>%
  dplyr::select(size.length.um.used.for.conversions, polymer, dose.mg.L.master, exposure.duration.d) %>%
  drop_na %>%
  mutate_if(~ is.numeric(.) && isTRUE(all(. > 0, na.rm = TRUE)), log10)
# Build the PCA on the numeric columns (column 2 is the polymer label).
pca_discrete <- stats::prcomp(multiVar_discrete[,-2], retx = TRUE, rank. = 2)
embed_plotly(pca_discrete$x, multiVar_discrete$polymer, color_scheme = palette.colors(palette = "Okabe-Ito"),
             title = "Polymer PCA", alpha_scale = 0.5, equal_axes = TRUE,
             tooltip = paste("Polymer:", multiVar_discrete$polymer))
The rpart algorithm works by splitting the dataset recursively, which means that the subsets that arise from a split are further split until a predetermined termination criterion is reached. At each step, the split is made based on the independent variable that results in the largest possible reduction in heterogeneity of the dependent (predicted) variable.
It is important to note that the algorithm works by making the best possible choice at each particular stage, without any consideration of whether those choices remain optimal in future stages. That is, the algorithm makes a locally optimal decision at each stage. It is thus quite possible that such a choice at one stage turns out to be sub-optimal in the overall scheme of things. In other words, the algorithm does not find a globally optimal tree.
# Trim data so the effect indicator is always known.
multiVar_sub <- multiVar %>%
  drop_na(effect_10)
# Split data into training (~80%) and test (~20%) sets, reproducibly.
set.seed(42)
multiVar_sub[,"train"] <- ifelse(runif(nrow(multiVar_sub)) < 0.8, 1, 0)
# Separate training and test sets.
trainSet <- multiVar_sub[multiVar_sub$train==1,]
testSet <- multiVar_sub[multiVar_sub$train==0,]
# Column index of the train flag. Anchored regex so a column merely
# *containing* "train" can never be dropped by mistake (the original
# grep("train", ...) matched substrings).
trainColNum <- grep("^train$", names(trainSet))
# Remove the train flag column from both sets.
trainSet <- trainSet[, -trainColNum]
testSet <- testSet[, -trainColNum]
Make a classification tree.
## n= 4649
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4649 1330 N (0.7139 0.2861)
## 2) size.length.um.used.for.conversions>=26.5 2065 464 N (0.7753 0.2247) *
## 3) size.length.um.used.for.conversions< 26.5 2584 866 N (0.6649 0.3351)
## 6) particle.volume.um3< 493.4 2078 616 N (0.7036 0.2964)
## 12) dose.particles.mL.master< 3.429e+06 1232 290 N (0.7646 0.2354) *
## 13) dose.particles.mL.master>=3.429e+06 846 326 N (0.6147 0.3853)
## 26) size.length.um.used.for.conversions>=0.02914 805 292 N (0.6373 0.3627) *
## 27) size.length.um.used.for.conversions< 0.02914 41 7 Y (0.1707 0.8293) *
## 7) particle.volume.um3>=493.4 506 250 N (0.5059 0.4941)
## 14) dose.particles.mL.master< 2.5 103 30 N (0.7087 0.2913) *
## 15) dose.particles.mL.master>=2.5 403 183 Y (0.4541 0.5459)
## 30) exposure.duration.d< 52.5 203 83 N (0.5911 0.4089)
## 60) particle.volume.um3< 1191 86 16 N (0.8140 0.1860) *
## 61) particle.volume.um3>=1191 117 50 Y (0.4274 0.5726)
## 122) particle.volume.um3>=3725 30 5 N (0.8333 0.1667) *
## 123) particle.volume.um3< 3725 87 25 Y (0.2874 0.7126) *
## 31) exposure.duration.d>=52.5 200 63 Y (0.3150 0.6850) *
Plot an interpretable tree.
## cex 1 xlim c(-0.2, 1.2) ylim c(0, 1)
Next we see how good the model is by seeing how it fares against the test data.
# Predict effect class for the held-out test set.
# NOTE(review): `t1` (the fitted rpart tree) and `typeColNum` (index of the
# effect column) are defined in a code chunk not shown here — verify both
# exist before running.
t1_predict <- predict(t1, newdata = testSet[,-typeColNum],
type="class")
# Proportion of correct predictions (accuracy) on the test set.
mean(t1_predict==testSet$effect)
## [1] 0.7094088
# [1] 0.7094088
#confusion matrix: predicted class (rows) vs true class (columns)
table(pred=t1_predict,true=testSet$effect)
## true
## pred N Y
## N 792 323
## Y 26 60
par(mfrow=c(1,2)) # two plots on one page
#plot approximate R-squared and relative error for different splits (2 plots). labels are only appropriate for the "anova" method.
# NOTE(review): `t1` is the rpart model fitted in a chunk not shown here.
rsq.rpart(t1)
##
## Classification tree:
## rpart(formula = effect ~ size.length.um.used.for.conversions +
## particle.volume.um3 + exposure.duration.d + media.temp +
## dose.particles.mL.master, data = trainSet, method = "class",
## control = rpart.control(minbucket = 20, cp = 0.008))
##
## Variables actually used in tree construction:
## [1] dose.particles.mL.master exposure.duration.d
## [3] particle.volume.um3 size.length.um.used.for.conversions
##
## Root node error: 1330/4649 = 0.28608
##
## n= 4649
##
## CP nsplit rel error xerror xstd
## 1 0.01391 0 1.00000 1.00000 0.023169
## 2 0.01015 6 0.91654 0.94060 0.022736
## 3 0.00800 8 0.89624 0.91729 0.022554
Next, we prune the tree using the cost complexity criterion. Basically, the intent is to see if a shallower subtree can give us comparable results. If so, we’d be better off choosing the shallower tree because it reduces the likelihood of overfitting.
As described earlier, we choose the appropriate pruning parameter (aka cost-complexity parameter) by picking the value that results in the lowest prediction error. Note that all relevant computations have already been carried out by R when we built the original tree (the call to rpart in the code above). All that remains now is to pick the value of the cost-complexity parameter, CP:
#cost-complexity pruning
# Print the complexity-parameter (CP) table: cross-validated error (xerror)
# at each candidate pruning level of the tree.
printcp(t1)
##
## Classification tree:
## rpart(formula = effect ~ size.length.um.used.for.conversions +
## particle.volume.um3 + exposure.duration.d + media.temp +
## dose.particles.mL.master, data = trainSet, method = "class",
## control = rpart.control(minbucket = 20, cp = 0.008))
##
## Variables actually used in tree construction:
## [1] dose.particles.mL.master exposure.duration.d
## [3] particle.volume.um3 size.length.um.used.for.conversions
##
## Root node error: 1330/4649 = 0.28608
##
## n= 4649
##
## CP nsplit rel error xerror xstd
## 1 0.01391 0 1.00000 1.00000 0.023169
## 2 0.01015 6 0.91654 0.94060 0.022736
## 3 0.00800 8 0.89624 0.91729 0.022554
It is clear from the above that the lowest cross-validation error (xerror in the table) occurs for CP = 0.008. One can find CP programmatically like so:
# get index of CP with lowest xerror
# (lowest cross-validated error is the standard pruning criterion)
opt <- which.min(t1$cptable[,"xerror"])
#get its value — the CP to pass to prune()
cp <- t1$cptable[opt, "CP"]
Next, we prune the tree based on this value of CP:
# # prune the tree
# pt1 <- prune(t1,cp)
# pt1<- prune(t1, cp= t1$cptable[which.min(t1$cptable[,"xerror"]),"CP"])
#
# # plot the pruned tree
# plot(pt1, uniform=TRUE,
# main="Pruned Classification Tree");text(pt1, use.n=TRUE, all=TRUE, cex=.8)
#
# #post(pfit, file = "c:/ptree.ps",
# # title = "Pruned Classification Tree for Kyphosis")
# #find proportion of correct predictions using test set
# t1_pruned_predict <- predict(pt1, testSet, type="class")
# mean(t1_pruned_predict == testSet$effect)
# #
This is not an improvement over an unpruned tree. We need to check that this holds up for different training and test sets. This is easily done by creating multiple random partitions of the dataset and checking the efficacy of pruning for each. To do this efficiently, I’ll create a function that takes the training fraction, number of runs (partitions) and the name of the dataset as inputs and outputs the proportion of correct predictions for each run. It also optionally prunes the tree.
# <!-- # #function to do multiple runs -->
# <!-- # multiple_runs_classification <- function(train_fraction,n,dataset,prune_tree=FALSE){ -->
# <!-- # fraction_correct <- rep(NA,n) -->
# <!-- # set.seed(42) -->
# <!-- # for (i in 1:n){ -->
# <!-- # dataset[,"train"] <- ifelse(runif(nrow(dataset))<0.8,1,0) -->
# <!-- # trainColNum <- grep("train",names(dataset)) -->
# <!-- # typeColNum <- grep("effect",names(dataset)) -->
# <!-- # trainSet <- dataset[dataset$train==1,-trainColNum] -->
# <!-- # testSet <- dataset[dataset$train==0,-trainColNum] -->
# <!-- # rpart_model <- rpart(effect~ size.length.um.used.for.conversions + particle.volume.um3 + exposure.duration.d + media.temp + dose.particles.mL.master,data = trainSet, method="class") -->
# <!-- # if(prune_tree==FALSE) { -->
# <!-- # rpart_test_predict <- predict(rpart_model,testSet[,-typeColNum],type="class") -->
# <!-- # fraction_correct[i] <- mean(rpart_test_predict==testSet$effect) -->
# <!-- # }else{ -->
# <!-- # opt <- which.min(rpart_model$cptable[,"xerror"]) -->
# <!-- # cp <- rpart_model$cptable[opt, "CP"] -->
# <!-- # pruned_model <- prune(rpart_model,cp) -->
# <!-- # rpart_pruned_predict <- predict(pruned_model,testSet[,-typeColNum],type="class") -->
# <!-- # fraction_correct[i] <- mean(rpart_pruned_predict == testSet$effect) -->
# <!-- # } -->
# <!-- # } -->
# <!-- # return(fraction_correct) -->
# <!-- # } -->
# <!-- # ``` -->
#
# <!-- # Note that in the above, I have set the default value of the prune_tree to FALSE, so the function will execute the first branch of the if statement unless the default is overridden. -->
# <!-- # -->
# <!-- # OK, so let’s do 50 runs with and without pruning, and check the mean and variance of the results for both sets of runs. -->
# <!-- # ```{r} -->
# <!-- # #50 runs, no pruning -->
# <!-- # unpruned_set <- multiple_runs_classification(0.8,50,multiVar_sub) -->
# <!-- # mean(unpruned_set) -->
# <!-- # #[1] 0.7261882 -->
# <!-- # sd(unpruned_set) -->
# <!-- # #[1] 0.01554347 -->
# <!-- # #50 runs, with pruning -->
# <!-- # pruned_set <- multiple_runs_classification(0.8,50,multiVar_sub,prune_tree=TRUE) -->
# <!-- # mean(pruned_set) -->
# <!-- # #[1] 0.7261875 -->
# <!-- # sd(pruned_set) -->
# <!-- # #[1] 0.01552285 -->
# <!-- # ``` -->
# library() errors loudly if the package is missing, unlike require(), which
# only warns and returns FALSE — a silent failure mode for a hard dependency.
library(party)
# Conditional random forest: classify effect_10 (binary effect indicator) from
# particle size, particle volume, exposure duration, media temperature, and dose.
crf <- cforest(as.factor(effect_10) ~ size.length.um.used.for.conversions + particle.volume.um3 + exposure.duration.d +
media.temp + dose.particles.mL.master,
controls = cforest_control(ntree = 500,
mincriterion = qnorm(0.8),
trace = TRUE), # adds progress bar because it's very slow
data = multiVar_sub)
##
## [> ] 0% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[=> ] 2% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[==> ] 4% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[===> ] 6% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[====> ] 8% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[=====> ] 10% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[======> ] 12% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[=======> ] 14% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[========> ] 16% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[=========> ] 18% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[==========> ] 20% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[===========> ] 22% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[============> ] 24% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[=============> ] 26% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[==============> ] 28% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[===============> ] 30% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[================> ] 32% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[=================> ] 34% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[==================> ] 36% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[===================> ] 38% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[====================> ] 40% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[=====================> ] 42% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[======================> ] 44% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[=======================> ] 46% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[========================> ] 48% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[=========================> ] 50% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[==========================> ] 52% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[===========================> ] 54% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[============================> ] 56% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[=============================> ] 58% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[==============================> ] 60% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[===============================> ] 62% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[================================> ] 64% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[=================================> ] 66% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[==================================> ] 68% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[===================================> ] 70% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[====================================> ] 72% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[=====================================> ] 74% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[======================================> ] 76% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[=======================================> ] 78% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[========================================> ] 80% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[=========================================> ] 82% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[==========================================> ] 84% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[===========================================> ] 86% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[============================================> ] 88% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[=============================================> ] 90% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[==============================================> ] 92% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[===============================================> ] 94% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[================================================> ] 96% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[=================================================> ] 98% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
[==================================================>] 100% completed
crf
##
## Random Forest using Conditional Inference Trees
##
## Number of trees: 500
##
## Response: as.factor(effect_10)
## Inputs: size.length.um.used.for.conversions, particle.volume.um3, exposure.duration.d, media.temp, dose.particles.mL.master
## Number of observations: 5850
# Predictor columns used by the fitted conditional random forest (crf)
predictor_cols <- c("size.length.um.used.for.conversions", "particle.volume.um3",
                    "exposure.duration.d", "media.temp", "dose.particles.mL.master")
# Predictors only (input for predict())
train2 <- multiVar_sub %>% dplyr::select(dplyr::all_of(predictor_cols))
# Predictors plus the observed response (for accuracy/confusion checks)
train3 <- multiVar_sub %>% dplyr::select(dplyr::all_of(predictor_cols), effect_10)
# Out-of-bag predictions from the conditional random forest (OOB = TRUE means
# each observation is predicted only by trees that did not see it when fitting)
fitted <- predict(crf, train2, OOB = TRUE, type ="response")
#rpart.prob <- predict(t1, newdata=imputedSmalls.requested.voluntary,type="prob")
# Misclassification rate: fraction of OOB predictions disagreeing with the
# observed response. NOTE(review): with OOB = TRUE this is out-of-bag accuracy,
# not in-sample training accuracy, despite the "Training Accuracy" label below.
misClasificError <- mean(fitted != train3$effect_10)
print(paste('Training Accuracy', 1 - misClasificError))
## [1] "Training Accuracy 0.792649572649573"
print(crf)
##
## Random Forest using Conditional Inference Trees
##
## Number of trees: 500
##
## Response: as.factor(effect_10)
## Inputs: size.length.um.used.for.conversions, particle.volume.um3, exposure.duration.d, media.temp, dose.particles.mL.master
## Number of observations: 5850
Alternative ROC Curve
# library() errors loudly if pROC is missing, unlike require(), which only
# warns and returns FALSE — a silent failure mode for a hard dependency.
library(pROC)
# Out-of-bag class predictions from the conditional random forest
predicted <- predict(crf, train2, OOB=TRUE, type= "response")
# AUC of predicted vs observed class labels (both coerced to numeric)
auc(as.numeric(train3$effect_10), as.numeric(predicted))
#Calculate ROC curve from observed response and numeric-coded predictions
rocCurve.tree <- roc(train3$effect_10,as.numeric(predicted))
#plot the ROC curve
plot(rocCurve.tree,col=c(4))
# Confusion matrix of the OOB forest predictions against the observed
# response (caret reports accuracy, kappa, sensitivity/specificity, etc.)
caret::confusionMatrix(fitted,as.factor(train3$effect_10))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3779 855
## 1 358 858
##
## Accuracy : 0.7926
## 95% CI : (0.782, 0.803)
## No Information Rate : 0.7072
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4528
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9135
## Specificity : 0.5009
## Pos Pred Value : 0.8155
## Neg Pred Value : 0.7056
## Prevalence : 0.7072
## Detection Rate : 0.6460
## Detection Prevalence : 0.7921
## Balanced Accuracy : 0.7072
##
## 'Positive' Class : 0
##
#plot feature importance
# Dot chart of conditional variable importance for a party::cforest model `x`.
# Side effects: draws the chart and (kept for backward compatibility) assigns
# the importance vector to the global `cforest_importance` via `<<-`.
# NOTE(review): `<<-` global assignment is an anti-pattern; prefer capturing
# the (invisible) return value instead.
cforestImpPlot <- function(x) {
  cforest_importance <<- v <- varimp(x)
  dotchart(v[order(v)])
  # Return the importance vector invisibly so callers can capture it.
  # Previously the function returned dotchart()'s NULL, which made the
  # assignment below useless (it printed "NULL").
  invisible(v)
}
importancePlot <- cforestImpPlot(crf)
importancePlot
## NULL
#GLMM
#tutorial https://aosmith.rbind.io/2020/08/20/simulate-binomial-glmm/
# library() errors loudly if lme4 is missing (needed for glmer() further down)
library(lme4)
# Binomial logistic GLM (fixed effects only — this is a plain glm, not yet a
# GLMM) predicting effect_f from particle, organism, and exposure covariates.
# NOTE(review): the original formula also included the response `effect_f` on
# the right-hand side; a response must not appear among its own predictors, so
# that term is removed here.
mod = glm(effect_f ~ size.length.um.used.for.conversions + polymer + particle.volume.um3 + density.mg.um.3 + organism.group + bio.org + treatments + exposure.duration.d + exposure.route + lvl1_f + dose.mg.L.master,
data = aoc_z,
family = binomial(link = "logit") )
mod
##
## Call: glm(formula = effect_f ~ size.length.um.used.for.conversions +
## polymer + particle.volume.um3 + density.mg.um.3 + organism.group +
## bio.org + treatments + effect_f + exposure.duration.d + exposure.route +
## lvl1_f + dose.mg.L.master, family = binomial(link = "logit"),
## data = aoc_z)
##
## Coefficients:
## (Intercept)
## 2.003e+00
## size.length.um.used.for.conversions
## 1.168e-04
## polymerNot Reported
## 1.946e-01
## polymerPA
## -9.836e-01
## polymerPE
## -5.899e-01
## polymerPET
## -9.957e-01
## polymerPMMA
## 1.376e+00
## polymerPP
## -4.278e-01
## polymerPS
## 1.663e-01
## polymerPVC
## 4.586e-01
## particle.volume.um3
## 9.749e-11
## density.mg.um.3
## -9.536e+08
## organism.groupAnnelida
## 6.760e-01
## organism.groupBacterium
## 6.240e-02
## organism.groupCnidaria
## 2.218e-01
## organism.groupCrustacea
## 8.432e-01
## organism.groupEchinoderm
## 1.932e-01
## organism.groupFish
## 1.706e-01
## organism.groupMixed
## -1.203e-01
## organism.groupMollusca
## -4.522e-01
## organism.groupNematoda
## 9.088e-01
## organism.groupPlant
## 1.037e+00
## organism.groupRotifera
## 1.372e+00
## bio.orgorganism
## -1.448e+00
## bio.orgpopulation
## -1.837e+00
## bio.orgsubcell
## -6.195e-01
## bio.orgtissue
## -1.003e+00
## treatments
## 5.067e-02
## exposure.duration.d
## 7.919e-03
## exposure.routemedia
## NA
## exposure.routesediment
## NA
## exposure.routewater
## -8.275e-01
## lvl1_fBehavioral, Sensory, Neurological
## 2.349e-02
## lvl1_fCirculatory, Respiratory
## -7.212e-01
## lvl1_fCommunity
## NA
## lvl1_fFitness
## -8.014e-01
## lvl1_fImmune
## -4.756e-01
## lvl1_fMetabolism
## -3.565e-01
## lvl1_fMicrobiome
## -1.775e-01
## lvl1_fStress
## -4.839e-01
## dose.mg.L.master
## -2.967e-08
##
## Degrees of Freedom: 4264 Total (i.e. Null); 4227 Residual
## (1785 observations deleted due to missingness)
## Null Deviance: 5341
## Residual Deviance: 4902 AIC: 4978
# Simulate a binomial GLMM dataset and fit it with glmer().
# n_sites  : number of sites (two plots each: one treatment, one control)
# b0, b1   : intercept and treatment effect on the log-odds scale
# num_samp : binomial sample size per plot
# site_var : variance of the random site intercepts
# Returns the fitted glmerMod object.
bin_glmm_fun <- function(n_sites = 10,
                         b0 = 0,
                         b1 = 1.735,
                         num_samp = 50,
                         site_var = 0.5) {
  # Site labels repeated for the two plots within each site
  site_labels <- rep(LETTERS[seq_len(n_sites)], each = 2)
  dat <- data.frame(
    site = site_labels,
    plot = paste(site_labels, rep(1:2, times = n_sites), sep = "."),
    treatment = rep(c("treatment", "control"), times = n_sites)
  )
  # Random site intercepts, each repeated for both plots of the site
  site_eff <- rep(rnorm(n = n_sites, mean = 0, sd = sqrt(site_var)), each = 2)
  # Linear predictor on the log-odds scale
  log_odds <- b0 + b1 * (dat$treatment == "treatment") + site_eff
  dat$num_samp <- num_samp
  # Binomial counts with success probability plogis(log_odds)
  dat$y <- rbinom(n = n_sites * 2, size = num_samp, prob = plogis(log_odds))
  glmer(cbind(y, num_samp - y) ~ treatment + (1 | site),
        data = dat,
        family = binomial(link = "logit"))
}
set.seed(16)
bin_glmm_fun()
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: cbind(y, num_samp - y) ~ treatment + (1 | site)
## Data: dat
## AIC BIC logLik deviance df.resid
## 122.6154 125.6025 -58.3077 116.6154 17
## Random effects:
## Groups Name Std.Dev.
## site (Intercept) 0.4719
## Number of obs: 20, groups: site, 10
## Fixed Effects:
## (Intercept) treatmenttreatment
## 0.1576 1.4859
library(quantregForest)
library(caret)
library(tidyverse)
library(tidymodels)
library(skimr)
library(sf)
library(ggspatial)
library(nhdplusTools)
library(patchwork)
library(Metrics)
library(gt)
# Convert every character column to a factor (required by the modelling
# functions below). across(where(...)) replaces the superseded mutate_if().
aoc_z_2 <- aoc_z %>%
mutate(across(where(is.character), as.factor))# %>%
# mutate(effect_10 = case_when( #convert ordinal to numeric
# effect_f == "Yes" ~ 1,
# effect_f == "No" ~ 0))
#choose relevant predictors and log-transform
multiVar_small_log <- aoc_z_2 %>%
dplyr::select(size_f,
ID, #row number ID for split/joining by study
doi, #need to split studies
size.length.um.used.for.conversions, shape, polymer, particle.volume.um3, density.mg.um.3, organism.group, bio.org, treatments, effect_f, exposure.duration.d, exposure.route, lvl1_f, dose.mg.L.master) %>%
mutate_if(~is.numeric(.) && (.) > 0, log10) %>%
drop_na() %>% #drop missing
mutate(effect_10 = case_when( #convert ordinal to numeric
effect_f == "Y" ~ 1,
effect_f == "N" ~ 0
))# %>%
# select(-effect_f)
skim(multiVar_small_log)
| Name | multiVar_small_log |
| Number of rows | 4265 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| factor | 10 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| size_f | 0 | 1 | FALSE | 5 | 1µm: 2419, 100: 794, 1nm: 542, 100: 380 |
| ID | 0 | 1 | FALSE | 4265 | ID1: 1, ID1: 1, ID1: 1, ID1: 1 |
| doi | 0 | 1 | FALSE | 121 | 10.: 372, 10.: 144, 10.: 144, 10.: 139 |
| shape | 0 | 1 | FALSE | 3 | sph: 2763, fra: 1391, fib: 111, Not: 0 |
| polymer | 0 | 1 | FALSE | 9 | PS: 2311, PE: 1318, PP: 211, PET: 143 |
| organism.group | 0 | 1 | FALSE | 12 | Fis: 1449, Cru: 1092, Mol: 1032, Alg: 333 |
| bio.org | 0 | 1 | FALSE | 5 | sub: 1886, org: 1689, cel: 395, tis: 238 |
| effect_f | 0 | 1 | FALSE | 2 | N: 2904, Y: 1361 |
| exposure.route | 0 | 1 | FALSE | 4 | wat: 4151, foo: 63, med: 34, sed: 17 |
| lvl1_f | 0 | 1 | FALSE | 9 | Fit: 2033, Met: 1331, Beh: 293, Imm: 182 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 0 | 1 | 0.89 | 1.26 | -4.00 | 0.30 | 1.00 | 1.84 | 3.70 | ▁▂▅▇▃ |
| particle.volume.um3 | 0 | 1 | 2.12 | 3.58 | -12.28 | 0.43 | 2.43 | 4.68 | 10.21 | ▁▂▅▇▃ |
| density.mg.um.3 | 0 | 1 | -8.98 | 0.04 | -9.06 | -9.01 | -8.97 | -8.97 | -8.85 | ▃▂▇▁▁ |
| treatments | 0 | 1 | 0.38 | 0.28 | 0.00 | 0.00 | 0.48 | 0.60 | 1.00 | ▇▃▆▇▁ |
| exposure.duration.d | 0 | 1 | 0.83 | 0.73 | -2.70 | 0.48 | 0.90 | 1.32 | 2.23 | ▁▁▁▇▅ |
| dose.mg.L.master | 0 | 1 | -0.31 | 2.06 | -11.64 | -1.49 | -0.30 | 1.00 | 8.17 | ▁▁▇▅▁ |
| effect_10 | 0 | 1 | 0.32 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
#no log transform for comparison
# Untransformed predictor subset; NAs are kept here and handled downstream
# (rough fix / rfImpute).
multiVar_small <- aoc_z %>%
dplyr::select(size_f, size.length.um.used.for.conversions, shape, polymer, particle.volume.um3, density.mg.um.3, organism.group, bio.org, treatments, effect_f, life.stage,
exposure.duration.d, lvl1_f, dose.mg.L.master, dose.particles.mL.master, dose.um3.mL.master)# %>%
#drop_na() #%>% #drop missing
# mutate(effect_10 = case_when( #convert ordinal to numeric
# effect == "Y" ~ 1,
# effect == "N" ~ 0
# )) %>%
# select(-effect)
#ensure completeness
skim(multiVar_small)
| Name | multiVar_small |
| Number of rows | 6050 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| size_f | 0 | 1.00 | FALSE | 6 | 1µm: 3241, 100: 1461, 1nm: 640, 100: 410 |
| shape | 0 | 1.00 | FALSE | 4 | sph: 3213, fra: 2410, Not: 267, fib: 160 |
| polymer | 0 | 1.00 | FALSE | 15 | PS: 2859, PE: 1815, PVC: 377, Not: 288 |
| organism.group | 0 | 1.00 | FALSE | 13 | Fis: 2125, Cru: 1415, Mol: 1279, Alg: 425 |
| bio.org | 0 | 1.00 | FALSE | 5 | org: 2574, sub: 2505, cel: 552, tis: 298 |
| effect_f | 75 | 0.99 | FALSE | 2 | N: 4198, Y: 1777 |
| life.stage | 0 | 1.00 | FALSE | 4 | Adu: 2559, Ear: 1804, Juv: 1108, Not: 579 |
| lvl1_f | 0 | 1.00 | FALSE | 9 | Fit: 2804, Met: 1807, Beh: 491, Imm: 303 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 125 | 0.98 | 1.708200e+02 | 6.172300e+02 | 0 | 3.00 | 10.5 | 126.0 | 5.000000e+03 | ▇▁▁▁▁ |
| particle.volume.um3 | 289 | 0.95 | 2.540824e+08 | 1.818038e+09 | 0 | 10.47 | 523.6 | 261848.6 | 1.636246e+10 | ▇▁▁▁▁ |
| density.mg.um.3 | 293 | 0.95 | 0.000000e+00 | 0.000000e+00 | 0 | 0.00 | 0.0 | 0.0 | 0.000000e+00 | ▅▇▁▂▁ |
| treatments | 0 | 1.00 | 2.910000e+00 | 1.810000e+00 | 1 | 1.00 | 3.0 | 4.0 | 1.000000e+01 | ▇▆▂▁▁ |
| exposure.duration.d | 72 | 0.99 | 2.095000e+01 | 3.656000e+01 | 0 | 4.00 | 10.0 | 28.0 | 4.500000e+02 | ▇▁▁▁▁ |
| dose.mg.L.master | 1265 | 0.79 | 7.615795e+04 | 2.443881e+06 | 0 | 0.03 | 0.5 | 12.5 | 1.484403e+08 | ▇▁▁▁▁ |
| dose.particles.mL.master | 1561 | 0.74 | 1.113595e+18 | 3.952454e+19 | 0 | 14.00 | 1000.0 | 350777.9 | 2.280000e+21 | ▇▁▁▁▁ |
| dose.um3.mL.master | 1561 | 0.74 | 1.777625e+11 | 4.606684e+12 | 0 | 26231.62 | 934579.4 | 20746888.0 | 2.112000e+14 | ▇▁▁▁▁ |
##NON-LOG TRANSFORMED ### Missing Value Imputation by Rough Fix
#rough fix
# na.roughfix: numeric NAs -> column median, factor NAs -> most frequent level.
multiVar_small_roughfix <- na.roughfix(multiVar_small)
# Create calibration and validation splits with tidymodels initial_split() function.
set.seed(4)
multiVar_small_split <- multiVar_small_roughfix %>%
initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# default is 3/4ths split (but 75% training, 25% testing).
# Stratification (strata) = grouping training/testing sets by region, state, etc.
# Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
# Create a training data set with the training() function
# Pulls from training and testing sets created by initial_split()
multiVar_small_train <- training(multiVar_small_split)
multiVar_small_test <- testing(multiVar_small_split)
# Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# NOTE(review): the COMID/StreamCat wording below is boilerplate from another
# project. More importantly, `filter(!ID %in% aoc$ID)` compares aoc against
# its OWN IDs, so notTrain is always an empty data frame; it presumably should
# exclude IDs used in the training split -- TODO confirm intent.
notTrain <- aoc %>% # all COMIDS from StreamCat data, sampled or not
filter(!ID %in% aoc$ID) # Removing sites used to train the model. n = 140,097
count_optimized <- paste0('n = ',nrow(multiVar_small_roughfix))
skim(multiVar_small_roughfix)
| Name | multiVar_small_roughfix |
| Number of rows | 6050 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| size_f | 0 | 1 | FALSE | 6 | 1µm: 3241, 100: 1461, 1nm: 640, 100: 410 |
| shape | 0 | 1 | FALSE | 4 | sph: 3213, fra: 2410, Not: 267, fib: 160 |
| polymer | 0 | 1 | FALSE | 15 | PS: 2859, PE: 1815, PVC: 377, Not: 288 |
| organism.group | 0 | 1 | FALSE | 13 | Fis: 2125, Cru: 1415, Mol: 1279, Alg: 425 |
| bio.org | 0 | 1 | FALSE | 5 | org: 2574, sub: 2505, cel: 552, tis: 298 |
| effect_f | 0 | 1 | FALSE | 2 | N: 4273, Y: 1777 |
| life.stage | 0 | 1 | FALSE | 4 | Adu: 2559, Ear: 1804, Juv: 1108, Not: 579 |
| lvl1_f | 0 | 1 | FALSE | 9 | Fit: 2804, Met: 1807, Beh: 491, Imm: 303 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 0 | 1 | 1.675100e+02 | 6.112400e+02 | 0 | 3.00 | 10.5 | 111.70 | 5.000000e+03 | ▇▁▁▁▁ |
| particle.volume.um3 | 0 | 1 | 2.419453e+08 | 1.774904e+09 | 0 | 11.78 | 523.6 | 143793.31 | 1.636246e+10 | ▇▁▁▁▁ |
| density.mg.um.3 | 0 | 1 | 0.000000e+00 | 0.000000e+00 | 0 | 0.00 | 0.0 | 0.00 | 0.000000e+00 | ▃▇▁▁▁ |
| treatments | 0 | 1 | 2.910000e+00 | 1.810000e+00 | 1 | 1.00 | 3.0 | 4.00 | 1.000000e+01 | ▇▆▂▁▁ |
| exposure.duration.d | 0 | 1 | 2.082000e+01 | 3.636000e+01 | 0 | 4.00 | 10.0 | 28.00 | 4.500000e+02 | ▇▁▁▁▁ |
| dose.mg.L.master | 0 | 1 | 6.023412e+04 | 2.173592e+06 | 0 | 0.10 | 0.5 | 4.90 | 1.484403e+08 | ▇▁▁▁▁ |
| dose.particles.mL.master | 0 | 1 | 8.262695e+17 | 3.404834e+19 | 0 | 50.10 | 1000.0 | 15620.76 | 2.280000e+21 | ▇▁▁▁▁ |
| dose.um3.mL.master | 0 | 1 | 1.318971e+11 | 3.968775e+12 | 0 | 93457.94 | 934579.4 | 7462686.57 | 2.112000e+14 | ▇▁▁▁▁ |
# Step Three - Kitchen Sink model -----------------------------------------
# Random forest --
# a decision tree model, using predictors to answer dichotomous questions to create nested splits.
# no pruning happens - rather, multiple trees are built (the forest) and then you are looking for consensus across trees
# training data goes down the tree and ends up in a terminal node.
# if testing data goes down the same route, then this upholds our conclusions. Or, if it goes awry, this allows us to look for patterns in how it goes awry.
set.seed(2) # makes the run below reproducible
# "Kitchen sink" classifier: predict effect_f from every other column.
myrf <- randomForest(y = multiVar_small_roughfix$effect_f, # dependent variable
                     x = multiVar_small_roughfix %>%
                       dplyr::select(-effect_f), # all predictor variables
                     importance = TRUE, # variable-importance scores (not causal)
                     proximity = TRUE, # proximity matrix (used by imputation/plots)
                     na.action = na.roughfix,
                     # BUG FIX: the argument is `ntree`, not `ntrees`. The
                     # misspelled name was silently swallowed by `...`, so the
                     # model actually grew the default 500 trees (the printed
                     # output confirms "Number of trees: 500").
                     ntree = 100)
myrf # examine the results.
##
## Call:
## randomForest(x = multiVar_small_roughfix %>% dplyr::select(-effect_f), y = multiVar_small_roughfix$effect_f, importance = T, proximity = T, na.action = na.roughfix, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 19.97%
## Confusion matrix:
## N Y class.error
## N 3848 425 0.09946174
## Y 783 994 0.44063028
~20% error rate. Let’s compare this with the same model with log-transformed values.
# Using caret to select the best predictors
# What are the parameters you want to use to run recursive feature elimination (rfe)?
# Control object for recursive feature elimination: random-forest scoring
# functions, 10-fold cross-validation, keep all resampling results.
my_ctrl <- rfeControl(functions = rfFuncs,
                      method = "cv",
                      verbose = FALSE,
                      returnResamp = "all")
# rfe = recursive feature elimination
# THIS STEP TAKES FOR-EV-ER!!!
set.seed(22)
my_rfe <- rfe(y = multiVar_small_roughfix$effect_f, # set dependent variable
              x = multiVar_small_roughfix %>%
                dplyr::select(-effect_f),
              rfeControl = my_ctrl,
              # Spelled out in full: the original `size =` only worked through
              # R's partial argument matching against `sizes`.
              sizes = c(1:2, 4, 6, 8, 10, 12, 14, 15))
# na.action = na.roughfix())
# sets how many variables are in the overall model
# # I have 13 total possible variables, so I've chosen increments of 3 to look at.
# rfeControl = my_ctrl,
# na.action = na.roughfix,
# testX = multiVar_small_test %>% dplyr::select(-effect_f),
# testY = multiVar_small_test$effect_f)
# can you make your model even simpler?
# the following will pick a model with the smallest number of predictor variables based on the tolerance ("tol") that you specify (how much less than the best are you willing to tolerate?)
#inspect
my_rfe
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.7109 0.04366 0.006508 0.02098
## 2 0.7352 0.24393 0.012324 0.03795
## 4 0.7830 0.43883 0.010690 0.03355
## 6 0.7987 0.48547 0.010395 0.03278
## 8 0.8048 0.50390 0.009335 0.02513
## 10 0.8053 0.50767 0.011775 0.03293
## 12 0.8058 0.50979 0.011016 0.03207 *
## 14 0.8026 0.50041 0.012484 0.03555
## 15 0.8018 0.49671 0.011058 0.03184
##
## The top 5 variables (out of 12):
## organism.group, exposure.duration.d, lvl1_f, dose.mg.L.master, dose.um3.mL.master
trellis.par.set(caretTheme())
#visualize
plot(my_rfe,type = c("g", "o"))
# Smallest subset size whose accuracy is within 5% of the best subset.
my_size <- pickSizeTolerance(my_rfe$results, metric = "Accuracy", tol = 5, maximize = TRUE)
# higher tol (~10) gives you less variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 1% of the best model."
pickVars(my_rfe$variables, size = my_size)
## [1] "organism.group" "exposure.duration.d" "lvl1_f"
## [4] "dose.mg.L.master"
The pickSizeTolerance determines the absolute best value then the percent difference of the other points to this value. This approach can produce good results for many of the tree based models, such as random forest, where there is a plateau of good performance for larger subset sizes. For trees, this is usually because unimportant variables are infrequently used in splits and do not significantly affect performance.
Just use best predictors.
set.seed(2) # makes the run below reproducible
# Refit using only the best predictors selected by RFE.
myrf_optimized <- randomForest(y = multiVar_small_train$effect_f, # dependent variable
                               x = multiVar_small_train %>%
                                 dplyr::select(c(organism.group, exposure.duration.d, lvl1_f, dose.um3.mL.master)), # RFE-selected predictors only
                               importance = TRUE, # how useful is a predictor in predicting values (nothing causal)
                               proximity = TRUE,
                               na.action = na.roughfix,
                               # BUG FIX: `ntree`, not `ntrees` -- the misspelling
                               # was ignored via `...` and 500 trees were grown.
                               ntree = 100)
myrf_optimized # examine the results.
##
## Call:
## randomForest(x = multiVar_small_train %>% dplyr::select(c(organism.group, exposure.duration.d, lvl1_f, dose.um3.mL.master)), y = multiVar_small_train$effect_f, importance = T, proximity = T, na.action = na.roughfix, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.23%
## Confusion matrix:
## N Y class.error
## N 2849 359 0.1119077
## Y 695 635 0.5225564
varImpPlot(myrf_optimized)
# Predict on the held-out test rows and report overall classification accuracy.
optimized_fitted <- predict(myrf_optimized, multiVar_small_test %>% dplyr::select(-effect_f), OOB = TRUE, type ="response")
# Misclassification rate = share of test predictions that differ from truth.
misClasificError_optimized <- mean(optimized_fitted != multiVar_small_test$effect_f)
accuracy_optimized <- paste0('Accuracy: ', round(100*(1 - misClasificError_optimized), 2), '%')
accuracy_optimized
## [1] "Accuracy: 80.03%"
# Sanity-check the random forest with a plain logistic regression built on the
# same four RFE-selected predictors (complete cases only).
df1 <- aoc_z %>%
  dplyr::select(organism.group, exposure.duration.d, lvl1_f,
                dose.um3.mL.master, effect_10) %>%
  drop_na()
m1 <- glm(effect_10 ~ organism.group + exposure.duration.d + lvl1_f + dose.um3.mL.master,
          family = binomial, data = df1, na.action = na.omit)
summary(m1)
##
## Call:
## glm(formula = effect_10 ~ organism.group + exposure.duration.d +
## lvl1_f + dose.um3.mL.master, family = "binomial", data = df1,
## na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6477 -0.8702 -0.7715 1.2140 2.3209
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 9.313e-02 2.015e-01 0.462 0.64390
## organism.groupAnnelida 3.993e-01 5.046e-01 0.791 0.42882
## organism.groupBacterium 1.005e+00 4.189e-01 2.400 0.01639
## organism.groupCnidaria -1.914e+00 3.736e-01 -5.123 3.01e-07
## organism.groupCrustacea -3.697e-01 1.283e-01 -2.882 0.00396
## organism.groupEchinoderm -9.854e-01 3.324e-01 -2.964 0.00303
## organism.groupFish -7.221e-01 1.271e-01 -5.683 1.32e-08
## organism.groupMixed -1.598e+00 7.263e-01 -2.201 0.02775
## organism.groupMollusca -1.133e+00 1.280e-01 -8.848 < 2e-16
## organism.groupNematoda 8.204e-01 3.662e-01 2.241 0.02506
## organism.groupPlant -3.141e-01 3.598e-01 -0.873 0.38281
## organism.groupRotifera 2.186e-01 2.524e-01 0.866 0.38653
## exposure.duration.d 1.097e-02 1.789e-03 6.131 8.72e-10
## lvl1_fBehavioral, Sensory, Neurological -7.990e-02 2.060e-01 -0.388 0.69809
## lvl1_fCirculatory, Respiratory -5.358e-01 2.908e-01 -1.842 0.06545
## lvl1_fCommunity NA NA NA NA
## lvl1_fFitness -8.049e-01 1.799e-01 -4.473 7.71e-06
## lvl1_fImmune 1.672e-01 2.205e-01 0.759 0.44810
## lvl1_fMetabolism -3.949e-02 1.780e-01 -0.222 0.82440
## lvl1_fMicrobiome -1.226e-02 2.945e-01 -0.042 0.96681
## lvl1_fStress 5.007e-02 2.772e-01 0.181 0.85667
## dose.um3.mL.master 1.986e-14 9.563e-15 2.076 0.03785
##
## (Intercept)
## organism.groupAnnelida
## organism.groupBacterium *
## organism.groupCnidaria ***
## organism.groupCrustacea **
## organism.groupEchinoderm **
## organism.groupFish ***
## organism.groupMixed *
## organism.groupMollusca ***
## organism.groupNematoda *
## organism.groupPlant
## organism.groupRotifera
## exposure.duration.d ***
## lvl1_fBehavioral, Sensory, Neurological
## lvl1_fCirculatory, Respiratory .
## lvl1_fCommunity
## lvl1_fFitness ***
## lvl1_fImmune
## lvl1_fMetabolism
## lvl1_fMicrobiome
## lvl1_fStress
## dose.um3.mL.master *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5543.1 on 4375 degrees of freedom
## Residual deviance: 5289.9 on 4355 degrees of freedom
## AIC: 5331.9
##
## Number of Fisher Scoring iterations: 4
# library() errors immediately if ggeffects is missing; require() only returns
# FALSE and would let the next line fail with a more confusing message.
library(ggeffects)
# Marginal predicted probability of an effect across the dose range.
m1_g <- ggeffects::ggpredict(m1, terms = "dose.um3.mL.master")
plot(m1_g)
### Missing Value Imputation by RF
The algorithm starts by imputing NAs using na.roughfix. Then randomForest is called with the completed data. The proximity matrix from the randomForest is used to update the imputation of the NAs. For continuous predictors, the imputed value is the weighted average of the non-missing observations, where the weights are the proximities. For categorical predictors, the imputed value is the category with the largest average proximity. This process is iterated iter times.
Note: Imputation has not (yet) been implemented for the unsupervised case. Also, Breiman (2003) notes that the OOB estimate of error from randomForest tend to be optimistic when run on the data matrix with imputed values.
# impute values
# The response passed to rfImpute may not contain NAs, so drop rows with a
# missing effect_f first.
multiVar_small_noNa <- multiVar_small %>% drop_na(effect_f)
# Proximity-based imputation: 4 iterations, each growing a 75-tree forest.
set.seed(111)
multiVar_small_rfImpute <- rfImpute(effect_f ~ ., # response, cannot contain NAs
                                    data = multiVar_small_noNa,
                                    iter = 4,
                                    ntree = 75)
## ntree OOB 1 2
## 75: 19.18% 10.03% 40.80%
## ntree OOB 1 2
## 75: 19.43% 10.22% 41.19%
## ntree OOB 1 2
## 75: 19.80% 10.84% 40.97%
## ntree OOB 1 2
## 75: 19.82% 10.74% 41.25%
# Create calibration and validation splits with tidymodels initial_split() function.
set.seed(4)
multiVar_small_split_imputed <- multiVar_small_rfImpute %>%
initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# default is 3/4ths split (but 75% training, 25% testing).
# Stratification (strata) = grouping training/testing sets by region, state, etc.
# Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
# Create a training data set with the training() function
# Pulls from training and testing sets created by initial_split()
multiVar_small_train_imputed <- training(multiVar_small_split_imputed)
multiVar_small_test_imputed <- testing(multiVar_small_split_imputed)
# Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# NOTE(review): `filter(!ID %in% aoc$ID)` filters aoc against its own IDs and
# always returns zero rows; the COMID/StreamCat wording is boilerplate from
# another project -- TODO confirm what notTrain should contain.
notTrain <- aoc %>% # all COMIDS from StreamCat data, sampled or not
filter(!ID %in% aoc$ID) # Removing sites used to train the model. n = 140,097
count_optimized_imputed <- paste0('n = ',nrow(multiVar_small_rfImpute))
skim(multiVar_small_rfImpute)
skim(multiVar_small_rfImpute)
| Name | multiVar_small_rfImpute |
| Number of rows | 5975 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| factor | 8 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| effect_f | 0 | 1 | FALSE | 2 | N: 4198, Y: 1777 |
| size_f | 0 | 1 | FALSE | 6 | 1µm: 3181, 100: 1457, 1nm: 634, 100: 405 |
| shape | 0 | 1 | FALSE | 4 | sph: 3177, fra: 2374, Not: 267, fib: 157 |
| polymer | 0 | 1 | FALSE | 15 | PS: 2844, PE: 1787, PVC: 372, Not: 269 |
| organism.group | 0 | 1 | FALSE | 13 | Fis: 2125, Cru: 1358, Mol: 1277, Alg: 425 |
| bio.org | 0 | 1 | FALSE | 5 | sub: 2505, org: 2499, cel: 552, tis: 298 |
| life.stage | 0 | 1 | FALSE | 4 | Adu: 2550, Ear: 1743, Juv: 1103, Not: 579 |
| lvl1_f | 0 | 1 | FALSE | 9 | Fit: 2730, Met: 1807, Beh: 490, Imm: 303 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 0 | 1 | 1.812200e+02 | 6.273400e+02 | 0 | 3.00 | 12.00 | 140.6 | 5.000000e+03 | ▇▁▁▁▁ |
| particle.volume.um3 | 0 | 1 | 2.789078e+08 | 1.833768e+09 | 0 | 10.47 | 523.60 | 366160.8 | 1.636246e+10 | ▇▁▁▁▁ |
| density.mg.um.3 | 0 | 1 | 0.000000e+00 | 0.000000e+00 | 0 | 0.00 | 0.00 | 0.0 | 0.000000e+00 | ▅▇▁▁▁ |
| treatments | 0 | 1 | 2.890000e+00 | 1.810000e+00 | 1 | 1.00 | 3.00 | 4.0 | 1.000000e+01 | ▇▆▂▁▁ |
| exposure.duration.d | 0 | 1 | 2.124000e+01 | 3.661000e+01 | 0 | 4.00 | 10.00 | 28.0 | 4.500000e+02 | ▇▁▁▁▁ |
| dose.mg.L.master | 0 | 1 | 1.061614e+05 | 2.272033e+06 | 0 | 0.10 | 1.89 | 100.0 | 1.484403e+08 | ▇▁▁▁▁ |
| dose.particles.mL.master | 0 | 1 | 1.270973e+18 | 3.518460e+19 | 0 | 50.10 | 10000.00 | 51932244.2 | 2.280000e+21 | ▇▁▁▁▁ |
| dose.um3.mL.master | 0 | 1 | 2.553182e+11 | 4.165520e+12 | 0 | 93457.94 | 5030933.69 | 186567164.2 | 2.112000e+14 | ▇▁▁▁▁ |
# Step Three - Kitchen Sink model -----------------------------------------
# Random forest --
# a decision tree model, using predictors to answer dichotomous questions to create nested splits.
# no pruning happens - rather, multiple trees are built (the forest) and then you are looking for consensus across trees
# training data goes down the tree and ends up in a terminal node.
# if testing data goes down the same route, then this upholds our conclusions. Or, if it goes awry, this allows us to look for patterns in how it goes awry.
set.seed(2) # makes the run below reproducible
# Kitchen-sink forest refit on the rfImpute-completed data.
myrf <- randomForest(y = multiVar_small_rfImpute$effect_f, # dependent variable
                     x = multiVar_small_rfImpute %>%
                       dplyr::select(-effect_f), # all predictor variables
                     importance = TRUE, # how useful is a predictor in predicting values (nothing causal)
                     proximity = TRUE,
                     # BUG FIX: `ntree`, not `ntrees` -- the misspelling was
                     # ignored and 500 trees were grown (see printed output).
                     ntree = 100)
myrf # examine the results.
##
## Call:
## randomForest(x = multiVar_small_rfImpute %>% dplyr::select(-effect_f), y = multiVar_small_rfImpute$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 19.97%
## Confusion matrix:
## N Y class.error
## N 3762 436 0.1038590
## Y 757 1020 0.4259989
~20% error rate. Let’s compare this with the same model with log-transformed values.
plot(myrf, log = "y", main = "Random Forest for Imputed Data")
# Using caret to select the best predictors
# What are the parameters you want to use to run recursive feature elimination (rfe)?
# Control object for recursive feature elimination on the imputed data.
my_ctrl <- rfeControl(functions = rfFuncs,
                      method = "cv",
                      verbose = FALSE,
                      returnResamp = "all")
# rfe = recursive feature elimination
# THIS STEP TAKES FOR-EV-ER!!!
set.seed(22)
my_rfe_imputed <- rfe(y = multiVar_small_rfImpute$effect_f, # set dependent variable
                      x = multiVar_small_rfImpute %>%
                        dplyr::select(-effect_f),
                      rfeControl = my_ctrl,
                      # Spelled out in full: the original `size =` only worked
                      # through R's partial argument matching against `sizes`.
                      sizes = c(1:2, 4, 6, 8, 10, 12, 14, 15))
# na.action = na.rfImpute())
# sets how many variables are in the overall model
# # I have 13 total possible variables, so I've chosen increments of 3 to look at.
# rfeControl = my_ctrl,
# na.action = na.rfImpute,
# testX = multiVar_small_test %>% dplyr::select(-effect_f),
# testY = multiVar_small_test$effect_f)
# can you make your model even simpler?
# the following will pick a model with the smallest number of predictor variables based on the tolerance ("tol") that you specify (how much less than the best are you willing to tolerate?)
#inspect
my_rfe_imputed
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.7076 0.04685 0.007185 0.02343
## 2 0.7324 0.24895 0.014505 0.03675
## 4 0.7878 0.46375 0.011796 0.03070
## 6 0.7913 0.47426 0.014692 0.03830
## 8 0.7975 0.49153 0.013654 0.03563
## 10 0.7988 0.49834 0.009971 0.02543
## 12 0.7995 0.49814 0.010290 0.02695
## 14 0.8000 0.49857 0.011321 0.02768
## 15 0.8029 0.50511 0.011584 0.02876 *
##
## The top 5 variables (out of 15):
## organism.group, exposure.duration.d, dose.um3.mL.master, lvl1_f, dose.particles.mL.master
trellis.par.set(caretTheme())
#visualize
plot(my_rfe_imputed,type = c("g", "o"))
# Smallest subset size whose accuracy is within 5% of the best subset.
my_size <- pickSizeTolerance(my_rfe_imputed$results, metric = "Accuracy", tol = 5, maximize = TRUE)
# higher tol (~10) gives you less variables
# lower tol (~1) gives you more variables - "I'd like the simplest model within 1% of the best model."
pickVars(my_rfe_imputed$variables, size = my_size)
## [1] "organism.group" "exposure.duration.d" "dose.um3.mL.master"
## [4] "lvl1_f"
The pickSizeTolerance determines the absolute best value then the percent difference of the other points to this value. This approach can produce good results for many of the tree based models, such as random forest, where there is a plateau of good performance for larger subset sizes. For trees, this is usually because unimportant variables are infrequently used in splits and do not significantly affect performance.
Just use best predictors.
set.seed(2) # makes the run below reproducible
# BUG FIX: this "imputed" model was trained on multiVar_small_train (the
# rough-fix split), not the imputed split -- which is why its printed confusion
# matrix was byte-identical to myrf_optimized's. Train on the rfImpute-based
# training set instead.
myrf_optimized_imputed <- randomForest(y = multiVar_small_train_imputed$effect_f, # dependent variable
                                       x = multiVar_small_train_imputed %>%
                                         dplyr::select(c(organism.group, exposure.duration.d, lvl1_f, dose.um3.mL.master)), # RFE-selected predictors only
                                       importance = TRUE, # how useful is a predictor in predicting values (nothing causal)
                                       proximity = TRUE,
                                       # BUG FIX: `ntree`, not `ntrees`.
                                       ntree = 100)
myrf_optimized_imputed # examine the results.
##
## Call:
## randomForest(x = multiVar_small_train %>% dplyr::select(c(organism.group, exposure.duration.d, lvl1_f, dose.um3.mL.master)), y = multiVar_small_train$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.23%
## Confusion matrix:
## N Y class.error
## N 2849 359 0.1119077
## Y 695 635 0.5225564
varImpPlot(myrf_optimized_imputed)
# Predict on the imputed held-out test rows and report overall accuracy.
optimized_imputed_fitted <- predict(myrf_optimized_imputed, multiVar_small_test_imputed %>% dplyr::select(-effect_f), OOB = TRUE, type ="response")
# Misclassification rate = share of test predictions that differ from truth.
misClasificError_optimized_imputed <- mean(optimized_imputed_fitted != multiVar_small_test_imputed$effect_f)
accuracy_optimized_imputed <- paste0('Accuracy: ', round(100*(1 - misClasificError_optimized_imputed), 2), '%')
accuracy_optimized_imputed
## [1] "Accuracy: 80.44%"
####LOG DATA
#log subset
## First split data by DOI, then re-join other data
set.seed(4)
# Hold out whole studies: sample unique DOIs rather than individual rows, so
# rows from one study never land in both train and test.
doi_split <- multiVar_small_log %>%
  distinct(doi) %>%
  initial_split(prop = 0.65)
#split just by doi
doi_train <- training(doi_split)
doi_test <- testing(doi_split)
# Re-attach the full rows for each study, then drop the split/ID helper columns.
train_full <- left_join(doi_train, multiVar_small_log, by = "doi") %>%
  dplyr::select(-c(doi, ID, effect_10)) %>%
  droplevels()
test_full <- left_join(doi_test, multiVar_small_log, by = "doi") %>%
  dplyr::select(-c(doi, ID, effect_10)) %>%
  droplevels()
#inspect proportion in test and train
nrow(test_full)
## [1] 921
nrow(train_full)
## [1] 3344
Now that we’ve split the data by study and ensured a good proportion in test and train, let’s run the model.
set.seed(2) # makes the run below reproducible
# Kitchen-sink forest on the log-transformed, study-split training data.
myrf_log <- randomForest(y = train_full$effect_f, # dependent variable
                         x = train_full %>%
                           dplyr::select(-effect_f), # all predictor variables
                         importance = TRUE, # how useful is a predictor in predicting values (nothing causal)
                         proximity = TRUE,
                         # BUG FIX: `ntree`, not `ntrees` -- the misspelling was
                         # ignored and 500 trees were grown.
                         ntree = 100)
myrf_log # examine the results.
##
## Call:
## randomForest(x = train_full %>% dplyr::select(-effect_f), y = train_full$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 20.66%
## Confusion matrix:
## N Y class.error
## N 1980 254 0.1136974
## Y 437 673 0.3936937
No performance enhancement with log-transformed values.
Repeat with continous variables whenever possible, and max of 8 predictors.
skim(multiVar) # profile the wider predictor set (continuous variables kept)
| Name | multiVar |
| Number of rows | 5925 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| factor | 16 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| size_f | 0 | 1.00 | FALSE | 5 | 1µm: 3241, 100: 1461, 1nm: 640, 100: 410 |
| shape | 0 | 1.00 | FALSE | 4 | sph: 3213, fra: 2388, Not: 164, fib: 160 |
| polymer | 0 | 1.00 | FALSE | 14 | PS: 2858, PE: 1728, PVC: 374, PP: 267 |
| organism.group | 0 | 1.00 | FALSE | 13 | Fis: 2041, Cru: 1393, Mol: 1277, Alg: 425 |
| environment | 0 | 1.00 | FALSE | 3 | Mar: 3198, Fre: 2547, Ter: 180 |
| bio.org | 0 | 1.00 | FALSE | 5 | org: 2527, sub: 2427, cel: 552, tis: 298 |
| effect | 75 | 0.99 | FALSE | 2 | N: 4137, Y: 1713 |
| effect_f | 75 | 0.99 | FALSE | 2 | N: 4137, Y: 1713 |
| exposure.route | 129 | 0.98 | FALSE | 7 | wat: 4498, foo: 697, sed: 531, med: 40 |
| lvl1_f | 0 | 1.00 | FALSE | 9 | Fit: 2767, Met: 1766, Beh: 486, Imm: 297 |
| lvl2_f | 0 | 1.00 | FALSE | 45 | Oxi: 881, Mor: 804, Rep: 581, Gro: 481 |
| sex | 5060 | 0.15 | FALSE | 3 | F: 439, M: 414, M,F: 12 |
| effect.metric | 2568 | 0.57 | FALSE | 8 | HON: 1877, LOE: 1020, NOE: 385, LC5: 42 |
| functional.group | 5547 | 0.06 | FALSE | 3 | COO: 201, NH2: 154, SO3: 23 |
| charge | 5223 | 0.12 | FALSE | 2 | neg: 537, pos: 165 |
| acute.chronic_f | 0 | 1.00 | FALSE | 3 | Acu: 2893, Chr: 2243, Una: 789 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 0 | 1.00 | 1.708200e+02 | 6.172300e+02 | 0.00 | 3.00 | 10.50 | 126.00 | 5.000000e+03 | ▇▁▁▁▁ |
| particle.volume.um3 | 164 | 0.97 | 2.540824e+08 | 1.818038e+09 | 0.00 | 10.47 | 523.60 | 261848.61 | 1.636246e+10 | ▇▁▁▁▁ |
| density.mg.um.3 | 262 | 0.96 | 0.000000e+00 | 0.000000e+00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.000000e+00 | ▅▇▁▂▁ |
| treatments | 0 | 1.00 | 2.930000e+00 | 1.820000e+00 | 1.00 | 1.00 | 3.00 | 4.00 | 1.000000e+01 | ▇▆▂▁▁ |
| effect_10 | 75 | 0.99 | 2.900000e-01 | 4.600000e-01 | 0.00 | 0.00 | 0.00 | 1.00 | 1.000000e+00 | ▇▁▁▁▃ |
| exposure.duration.d | 72 | 0.99 | 2.107000e+01 | 3.692000e+01 | 0.00 | 4.00 | 10.00 | 28.00 | 4.500000e+02 | ▇▁▁▁▁ |
| media.temp | 868 | 0.85 | 2.131000e+01 | 4.590000e+00 | 7.50 | 18.00 | 20.00 | 25.00 | 2.900000e+01 | ▁▃▇▃▇ |
| dose.mg.L.master | 1224 | 0.79 | 7.751876e+04 | 2.465602e+06 | 0.00 | 0.03 | 0.50 | 18.36 | 1.484403e+08 | ▇▁▁▁▁ |
| media.ph | 3826 | 0.35 | 7.540000e+00 | 5.800000e-01 | 6.00 | 7.20 | 7.60 | 8.00 | 8.500000e+00 | ▂▂▅▇▅ |
| media.sal.ppt | 4113 | 0.31 | 2.882000e+01 | 8.170000e+00 | 0.06 | 28.00 | 32.00 | 34.00 | 3.840000e+01 | ▁▁▁▃▇ |
| dose.particles.mL.master | 1436 | 0.76 | 1.113595e+18 | 3.952454e+19 | 0.00 | 14.00 | 1000.00 | 350777.93 | 2.280000e+21 | ▇▁▁▁▁ |
| zetapotential.mV | 5244 | 0.11 | -1.837000e+01 | 2.767000e+01 | -87.06 | -30.90 | -24.70 | -5.10 | 1.060000e+02 | ▂▇▅▁▁ |
| max.size.ingest.mm | 4800 | 0.19 | 1.900000e-01 | 1.400000e-01 | 0.04 | 0.11 | 0.11 | 0.40 | 4.000000e-01 | ▅▇▁▁▆ |
| dose.mg.L.master.AF.noec | 3253 | 0.45 | 1.062252e+05 | 2.977044e+06 | 0.00 | 0.08 | 0.70 | 20.00 | 1.484403e+08 | ▇▁▁▁▁ |
| dose.particles.mL.master.AF.noec | 3387 | 0.43 | 4.015210e+17 | 1.285796e+19 | 0.00 | 39.77 | 2000.00 | 285586.44 | 5.720000e+20 | ▇▁▁▁▁ |
| effect.score | 0 | 1.00 | 2.230000e+00 | 1.770000e+00 | 1.00 | 1.00 | 1.00 | 3.00 | 6.000000e+00 | ▇▂▁▁▂ |
# Acute, aquatic, organism-level fitness subset with aqueous exposure and
# reported polymer/shape; predictors log10-transformed, complete cases only.
acute <- aoc_z %>%
filter(acute.chronic_f == "Acute") %>%
filter(!environment == "Terrestrial") %>%
filter(bio.org == "organism") %>%
filter(lvl1_f == "Fitness") %>%
filter(!exposure.route == "food") %>%
filter(!polymer == "Not Reported") %>%
filter(!shape == "Not Reported") %>%
dplyr::select(c(doi, dose.mg.L.master, organism.group, size.length.um.used.for.conversions, polymer, shape, effect.score, dose.particles.mL.master, effect_f)) %>%
# BUG FIX: `is.numeric(.) && (.) > 0` applied the scalar `&&` to a whole
# column (pre-R 4.3: first value only; R >= 4.3: error). `all(. > 0)`
# log10-transforms only numeric columns that are strictly positive throughout.
mutate_if(~is.numeric(.) && all(. > 0), log10) %>%
drop_na() %>%
droplevels()
count_acute <- paste0('n = ',nrow(acute))
skim(acute)
| Name | acute |
| Number of rows | 840 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| factor | 5 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| doi | 0 | 1 | FALSE | 49 | 10.: 103, 10.: 96, doi: 64, 10.: 60 |
| organism.group | 0 | 1 | FALSE | 3 | Cru: 449, Mol: 211, Fis: 180 |
| polymer | 0 | 1 | FALSE | 6 | PS: 382, PE: 339, PET: 56, PP: 53 |
| shape | 0 | 1 | FALSE | 3 | sph: 477, fra: 281, fib: 82 |
| effect_f | 0 | 1 | FALSE | 2 | N: 665, Y: 175 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dose.mg.L.master | 0 | 1 | 0.37 | 2.07 | -11.64 | -0.74 | 0.28 | 1.40 | 8.17 | ▁▁▇▇▁ |
| size.length.um.used.for.conversions | 0 | 1 | 0.80 | 1.42 | -1.46 | -0.30 | 0.74 | 1.74 | 3.70 | ▅▆▇▅▃ |
| effect.score | 0 | 1 | 0.63 | 0.14 | 0.48 | 0.48 | 0.60 | 0.78 | 0.78 | ▇▁▂▁▇ |
| dose.particles.mL.master | 0 | 1 | 4.52 | 3.75 | -4.20 | 1.70 | 4.04 | 7.25 | 12.65 | ▂▇▇▅▃ |
With a complete dataset for just acute studies in aquatic organisms with aqueous route of exposure, we are left with 840 complete cases (see the skim output above) with 7 predictor variables, 1 response variable (effect y/n), and an ID (doi). Let’s determine if we have enough data for ML.
# Rule-of-thumb sample-size check: e^(number of predictors). exp(6) is the
# idiomatic form of the original exp(1)^6 (identical value, one fewer op).
exp(6)
## [1] 403.4288
\(e^6\) is less than n (840), so we may proceed.
## First split data by DOI, then re-join other data
set.seed(4)
# Sample whole studies (DOIs) into train/test so no study contributes rows to
# both sets.
acute_doi_split <- acute %>%
  distinct(doi) %>%
  initial_split(prop = 0.65)
#split just by doi
acute_doi_train <- training(acute_doi_split)
acute_doi_test <- testing(acute_doi_split)
# Re-attach the full rows for each sampled study and drop the doi key.
acute_train <- left_join(acute_doi_train, acute, by = "doi") %>%
  dplyr::select(-doi) %>%
  droplevels()
acute_test <- left_join(acute_doi_test, acute, by = "doi") %>%
  dplyr::select(-doi) %>%
  droplevels()
#inspect proportion in test and train
nrow(acute_test)
## [1] 192
nrow(acute_train)
## [1] 648
# Create calibration and validation splits with tidymodels initial_split() function.
# set.seed(4)
# acute_split <- acute %>%
# initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# # default is 3/4ths split (but 75% training, 25% testing).
# # Stratification (strata) = grouping training/testing sets by region, state, etc.
# # Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
#
# # Create a training data set with the training() function
# # Pulls from training and testing sets created by initial_split()
# acute_train <- training(acute_split)
# acute_test <- testing(acute_split)
# # Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# Random forest --
set.seed(2) # fixes the RNG so the forest fit below is reproducible
# Random forest classifying effect (Y/N) from all remaining predictors.
acuterf <- randomForest(y = acute_train$effect_f, # dependent variable
                        x = acute_train %>%
                          dplyr::select(-effect_f), # all predictor variables
                        importance = TRUE, # permutation importance (not causal)
                        proximity = TRUE,
                        # BUG FIX: the argument name is `ntree`, not `ntrees`.
                        # The misspelled `ntrees` was silently absorbed by `...`
                        # and the 500-tree default grew instead (the printed
                        # call output below shows "Number of trees: 500").
                        ntree = 100)
acuterf # examine the results.
##
## Call:
## randomForest(x = acute_train %>% dplyr::select(-effect_f), y = acute_train$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 15.59%
## Confusion matrix:
## N Y class.error
## N 476 32 0.06299213
## Y 69 71 0.49285714
# OOB error as a function of number of trees; a flat tail = enough trees.
plot(acuterf)
# model performance appears to improve most at ~75 trees
varImpPlot(acuterf)
# displays which variables are most important
# helps to winnow down list of predictors
# recommended to weigh left pane more
# right pane also shows how evenly things split based on the list of predictors
# values close to 0 can be dropped, but don't have to be
Dose, organism group, exposure duration are important.
importance <- acuterf$importance
#View(importance)
# displays the data plotted in the plot above
# fix levels
# BUG FIX: `levels(x) <- ...` relabels the existing factor codes positionally
# and can silently scramble categories when train/test level sets differ
# after droplevels(); re-coding with factor(x, levels = ...) preserves each
# observation's label (unseen labels become NA instead of being mislabeled).
acute_test$polymer <- factor(acute_test$polymer,
                             levels = levels(acute_train$polymer))
acute_test$shape <- factor(acute_test$shape,
                           levels = levels(acute_train$shape))
acute_test$organism.group <- factor(acute_test$organism.group,
                                    levels = levels(acute_train$organism.group))
# Held-out class predictions. NOTE(review): `OOB` is not an argument of
# predict.randomForest() and was ignored, so it is dropped here; these
# predictions are genuinely out-of-sample anyway (held-out studies).
fitted <- predict(acuterf,
                  newdata = acute_test %>% dplyr::select(-effect_f),
                  type = "response")
# Misclassification rate and formatted accuracy for plot subtitles.
misClasificError_acute <- mean(fitted != acute_test$effect_f)
accuracy_acute <- paste0('Accuracy: ', round(100*(1 - misClasificError_acute), 2), '%')
accuracy_acute
## [1] "Accuracy: 77.08%"
Alternative ROC Curve
library(pROC) # library() errors loudly if pROC is missing; require() only warns
predicted <- predict(acuterf, acute_test %>% dplyr::select(-effect_f),
                     type = "response")
# Calculate ROC curve from the held-out predictions.
# NOTE(review): hard class labels (not probabilities) are used here, so this
# "curve" has a single operating point; probability-based ROC curves are
# built later with type = "prob".
rocCurve.tree <- roc(as.numeric(acute_test$effect_f), as.numeric(predicted))
# plot the ROC curve (col 4 = blue)
plot(rocCurve.tree, col = c(4))
#### Chronic
# Chronic, aqueous-exposure, organism-level fitness records for aquatic taxa.
chronic <- aoc_z %>%
  filter(acute.chronic_f == "Chronic") %>%
  filter(!environment == "Terrestrial") %>%
  filter(bio.org == "organism") %>%
  filter(lvl1_f == "Fitness") %>%
  filter(!exposure.route == "food") %>%
  dplyr::select(c(doi, dose.mg.L.master, organism.group,
                  size.length.um.used.for.conversions, polymer, shape,
                  effect.score, dose.particles.mL.master, effect_f)) %>%
  # Log10-transform strictly positive numeric columns.
  # BUG FIX: the original predicate `is.numeric(.) && (.) > 0` applied the
  # scalar `&&` to a whole column — an error in R >= 4.3, and in older R it
  # silently tested only the first value; all() collapses the comparison to
  # a single TRUE/FALSE per column.
  mutate_if(~ is.numeric(.) && all(. > 0, na.rm = TRUE), log10) %>%
  drop_na() %>%
  droplevels()
count_chronic <- paste0('n = ', nrow(chronic))
skim(chronic)
| Name | chronic |
| Number of rows | 428 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| factor | 5 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| doi | 0 | 1 | FALSE | 24 | 10.: 96, doi: 48, 10.: 44, 10.: 36 |
| organism.group | 0 | 1 | FALSE | 3 | Cru: 368, Mol: 35, Fis: 25 |
| polymer | 0 | 1 | FALSE | 10 | PS: 191, PE: 80, Not: 73, PET: 30 |
| shape | 0 | 1 | FALSE | 3 | sph: 271, fra: 156, fib: 1 |
| effect_f | 0 | 1 | FALSE | 2 | N: 357, Y: 71 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dose.mg.L.master | 0 | 1 | -0.45 | 1.56 | -3.62 | -1.64 | -0.54 | 0.60 | 3.33 | ▃▆▇▅▂ |
| size.length.um.used.for.conversions | 0 | 1 | 0.61 | 1.01 | -1.15 | 0.37 | 0.74 | 1.00 | 3.18 | ▅▆▇▅▁ |
| effect.score | 0 | 1 | 0.60 | 0.11 | 0.48 | 0.48 | 0.60 | 0.60 | 0.78 | ▆▁▇▁▃ |
| dose.particles.mL.master | 0 | 1 | 4.34 | 2.98 | -4.20 | 2.58 | 4.00 | 5.14 | 11.89 | ▁▂▇▁▂ |
With a complete dataset for just chronic studies in aquatic organisms with aqueous route of exposure, we are left with 428 complete cases with 7 predictor variables, 1 response variable (effect y/n), and the study DOI. Let’s determine if we have enough data for ML.
exp(1)^6
## [1] 403.4288
\(e^6\) (≈ 403) is less than n (428), so we may proceed.
## First split data by DOI, then re-join other data
# Study-level (DOI) split: all rows from one paper stay on the same side, so
# the test set measures generalization to unseen studies.
set.seed(4)
chronic_doi_split <- chronic %>%
dplyr::select(doi) %>%
unique() %>%
initial_split(prop = 0.85)
#split just by doi
chronic_doi_train <- training(chronic_doi_split)
chronic_doi_test <- testing(chronic_doi_split)
# Re-join full rows per DOI, then drop doi so it cannot act as a predictor.
chronic_train <- left_join(chronic_doi_train, chronic, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
chronic_test <- left_join(chronic_doi_test, chronic, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
#inspect proportion in test and train
nrow(chronic_test)
## [1] 116
nrow(chronic_train)
## [1] 312
# # Create calibration and validation splits with tidymodels initial_split() function.
# set.seed(4)
# chronic_split <- chronic %>%
# initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# # default is 3/4ths split (but 75% training, 25% testing).
# # Stratification (strata) = grouping training/testing sets by region, state, etc.
# # Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
#
# # Create a training data set with the training() function
# # Pulls from training and testing sets created by initial_split()
# chronic_train <- training(chronic_split)
# chronic_test <- testing(chronic_split)
# # Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# Random forest --
set.seed(2) # fixes the RNG so the forest fit below is reproducible
# Random forest classifying effect (Y/N) for the chronic subset.
chronicrf <- randomForest(y = chronic_train$effect_f, # dependent variable
                          x = chronic_train %>%
                            dplyr::select(-effect_f), # all predictor variables
                          importance = TRUE, # permutation importance (not causal)
                          proximity = TRUE,
                          # BUG FIX: argument is `ntree`, not `ntrees`; the typo
                          # was absorbed by `...` and the 500-tree default grew
                          # (see "Number of trees: 500" in the output below).
                          ntree = 100)
chronicrf # examine the results.
##
## Call:
## randomForest(x = chronic_train %>% dplyr::select(-effect_f), y = chronic_train$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 20.19%
## Confusion matrix:
## N Y class.error
## N 245 11 0.04296875
## Y 52 4 0.92857143
plot(chronicrf)
# model performance appears to improve most at ~75 trees
varImpPlot(chronicrf)
# displays which variables are most important
# helps to winnow down list of predictors
# recommended to weigh left pane more
# right pane also shows how evenly things split based on the list of predictors
# values close to 0 can be dropped, but don't have to be
Dose, organism group, exposure duration are important.
importance <- chronicrf$importance
#View(importance)
# displays the data plotted in the plot above
# fix levels
# BUG FIX: `levels(x) <- ...` relabels factor codes positionally and can
# scramble categories when train/test level sets differ after droplevels();
# factor(x, levels = ...) preserves each observation's label.
chronic_test$polymer <- factor(chronic_test$polymer,
                               levels = levels(chronic_train$polymer))
chronic_test$shape <- factor(chronic_test$shape,
                             levels = levels(chronic_train$shape))
chronic_test$organism.group <- factor(chronic_test$organism.group,
                                      levels = levels(chronic_train$organism.group))
# Held-out predictions. NOTE(review): `OOB` is not a predict.randomForest()
# argument and was ignored; dropped here.
fitted <- predict(chronicrf, chronic_test %>%
                    dplyr::select(-effect_f),
                  type = "response")
misClasificError_chronic <- mean(fitted != chronic_test$effect_f)
accuracy_chronic <- paste0('Accuracy: ', round(100*(1 - misClasificError_chronic), 2), '%')
accuracy_chronic
## [1] "Accuracy: 81.9%"
# Combined acute + chronic aqueous-exposure organism-level fitness records.
# (Intentionally NOT filtered by acute/chronic.) NOTE: the name `all` shadows
# the data-frame binding only; calls to the base function all() still resolve.
all <- aoc_z %>%
  filter(!environment == "Terrestrial") %>%
  filter(bio.org == "organism") %>%
  filter(lvl1_f == "Fitness") %>%
  filter(!exposure.route == "food") %>%
  dplyr::select(c(size.length.um.used.for.conversions, shape, polymer,
                  particle.volume.um3, organism.group, dose.mg.L.master,
                  effect.score, lvl2_f, effect_f, doi)) %>%
  # Log10-transform strictly positive numeric columns.
  # BUG FIX: `is.numeric(.) && (.) > 0` applied scalar `&&` to a whole
  # column — an error in R >= 4.3, and only the first value was tested in
  # older R; all() collapses the comparison to one TRUE/FALSE per column.
  mutate_if(~ is.numeric(.) && all(. > 0, na.rm = TRUE), log10) %>%
  drop_na()
count_all <- paste0('n = ', nrow(all))
skim(all)
| Name | all |
| Number of rows | 1510 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| shape | 0 | 1 | FALSE | 3 | sph: 848, fra: 579, fib: 83, Not: 0 |
| polymer | 0 | 1 | FALSE | 11 | PS: 623, PE: 524, Not: 116, PET: 86 |
| organism.group | 0 | 1 | FALSE | 8 | Cru: 878, Mol: 246, Fis: 211, Ech: 63 |
| lvl2_f | 0 | 1 | FALSE | 5 | Mor: 574, Rep: 338, Dev: 295, Gro: 222 |
| effect_f | 0 | 1 | FALSE | 2 | N: 1199, Y: 311 |
| doi | 0 | 1 | FALSE | 83 | 10.: 144, 10.: 136, 10.: 103, doi: 72 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| size.length.um.used.for.conversions | 0 | 1 | 0.79 | 1.25 | -1.46 | 0.30 | 0.74 | 1.77 | 3.70 | ▅▆▇▅▂ |
| particle.volume.um3 | 0 | 1 | 1.76 | 3.54 | -4.67 | 0.17 | 1.15 | 4.24 | 10.21 | ▃▇▆▃▂ |
| dose.mg.L.master | 0 | 1 | 0.14 | 1.94 | -11.64 | -0.91 | 0.10 | 1.40 | 8.17 | ▁▁▇▇▁ |
| effect.score | 0 | 1 | 0.62 | 0.13 | 0.48 | 0.48 | 0.60 | 0.78 | 0.78 | ▇▁▅▁▇ |
With a complete dataset for just organismal fitness studies in aquatic organisms with aqueous route of exposure, we are left with 1510 complete cases with 8 predictor variables, and 1 response variable (effect y/n). Let’s determine if we have enough data for ML.
exp(1)^7
## [1] 1096.633
\(e^7\) (≈ 1097) is less than n (1510), so we may proceed.
## First split data by DOI, then re-join other data
# Study-level (DOI) split so held-out accuracy reflects unseen studies.
set.seed(4)
all_doi_split <- all %>%
dplyr::select(doi) %>%
unique() %>%
initial_split(prop = 0.7)
#split just by doi
all_doi_train <- training(all_doi_split)
all_doi_test <- testing(all_doi_split)
# Re-join full rows per DOI, then drop doi so it is not a predictor.
all_train <- left_join(all_doi_train, all, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
all_test <- left_join(all_doi_test, all, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
#inspect proportion in test and train
nrow(all_test)
## [1] 296
nrow(all_train)
## [1] 1214
# Create calibration and validation splits with tidymodels initial_split() function.
set.seed(4)
# all_split <- all %>%
# initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# # default is 3/4ths split (but 75% training, 25% testing).
# # Stratification (strata) = grouping training/testing sets by region, state, etc.
# # Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
#
# # Create a training data set with the training() function
# # Pulls from training and testing sets created by initial_split()
# all_train <- training(all_split)
# all_test <- testing(all_split)
# # Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# Random forest --
set.seed(2) # fixes the RNG so the forest fit below is reproducible
# Random forest classifying effect (Y/N) for the combined acute+chronic set.
allrf <- randomForest(y = all_train$effect_f, # dependent variable
                      x = all_train %>%
                        dplyr::select(-effect_f), # all predictor variables
                      importance = TRUE, # permutation importance (not causal)
                      proximity = TRUE,
                      # BUG FIX: argument is `ntree`, not `ntrees`; the typo was
                      # absorbed by `...` and the 500-tree default grew (see the
                      # printed output below).
                      ntree = 100)
allrf # examine the results.
##
## Call:
## randomForest(x = all_train %>% dplyr::select(-effect_f), y = all_train$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 17.79%
## Confusion matrix:
## N Y class.error
## N 925 42 0.0434333
## Y 174 73 0.7044534
plot(allrf)
# model performance appears to improve most at ~75 trees
varImpPlot(allrf)
# displays which variables are most important
# helps to winnow down list of predictors
# recommended to weigh left pane more
# right pane also shows how evenly things split based on the list of predictors
# values close to 0 can be dropped, but don't have to be
Dose, organism group, exposure duration are important.
importance <- allrf$importance
#View(importance)
# displays the data plotted in the plot above
# Align test-set factor levels with the training set.
# BUG FIX: `levels(x) <- ...` relabels factor codes positionally and can
# scramble categories when train/test level sets differ after droplevels();
# factor(x, levels = ...) preserves each observation's label.
all_test$polymer <- factor(all_test$polymer,
                           levels = levels(all_train$polymer))
all_test$shape <- factor(all_test$shape,
                         levels = levels(all_train$shape))
all_test$organism.group <- factor(all_test$organism.group,
                                  levels = levels(all_train$organism.group))
# Held-out predictions. NOTE(review): `OOB` is not a predict.randomForest()
# argument and was ignored; dropped here.
fitted <- predict(allrf, all_test %>%
                    dplyr::select(-effect_f),
                  type = "response")
misClasificError_all <- mean(fitted != all_test$effect_f)
accuracy_all <- paste0('Accuracy: ', round(100*(1 - misClasificError_all), 2), '%')
accuracy_all
## [1] "Accuracy: 75.34%"
# Whole-dataset model: only rows with unreported shape/polymer are removed.
# (The endpoint/environment/route filters used above are intentionally not
# applied here; effect.score is also intentionally excluded as a predictor.)
nofilter <- aoc_z %>%
  filter(!shape == "Not Reported") %>%
  filter(!polymer == "Not Reported") %>%
  dplyr::select(c(doi, dose.mg.L.master, organism.group,
                  size.length.um.used.for.conversions, polymer, shape,
                  dose.particles.mL.master,
                  effect_f,
                  particle.volume.um3,
                  exposure.duration.d,
                  lvl1_f)) %>%
  droplevels() %>%
  # Log10-transform strictly positive numeric columns.
  # BUG FIX: `is.numeric(.) && (.) > 0` applied scalar `&&` to a whole
  # column — an error in R >= 4.3, and only the first value was tested in
  # older R; all() collapses the comparison to one TRUE/FALSE per column.
  mutate_if(~ is.numeric(.) && all(. > 0, na.rm = TRUE), log10) %>%
  drop_na()
count_nofilter <- paste0('n = ', nrow(nofilter))
skim(nofilter)
| Name | nofilter |
| Number of rows | 4266 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 6 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| doi | 0 | 1 | FALSE | 122 | 10.: 372, 10.: 144, 10.: 139, 10.: 126 |
| organism.group | 0 | 1 | FALSE | 12 | Fis: 1461, Mol: 1050, Cru: 1006, Alg: 393 |
| polymer | 0 | 1 | FALSE | 10 | PS: 2368, PE: 1288, PP: 241, PVC: 170 |
| shape | 0 | 1 | FALSE | 3 | sph: 2710, fra: 1445, fib: 111 |
| effect_f | 0 | 1 | FALSE | 2 | N: 2858, Y: 1408 |
| lvl1_f | 0 | 1 | FALSE | 9 | Fit: 1962, Met: 1379, Beh: 293, Imm: 203 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| dose.mg.L.master | 0 | 1 | -0.21 | 2.07 | -11.64 | -1.47 | -0.30 | 1.26 | 8.17 | ▁▁▇▅▁ |
| size.length.um.used.for.conversions | 0 | 1 | 0.90 | 1.27 | -4.00 | 0.24 | 1.00 | 1.85 | 3.70 | ▁▂▅▇▃ |
| dose.particles.mL.master | 0 | 1 | 3.74 | 3.70 | -4.20 | 1.15 | 3.00 | 5.76 | 21.36 | ▃▇▃▁▁ |
| particle.volume.um3 | 0 | 1 | 2.14 | 3.59 | -12.28 | 0.17 | 2.67 | 4.82 | 10.21 | ▁▂▅▇▂ |
| exposure.duration.d | 0 | 1 | 0.81 | 0.76 | -2.70 | 0.48 | 0.90 | 1.32 | 2.23 | ▁▁▁▇▅ |
With a complete dataset spanning both acute and chronic studies across all endpoint categories (removing only rows with unreported shape or polymer), we are left with 4266 complete cases with 9 predictor variables and 1 response variable (effect y/n). Let’s determine if we have enough data for ML.
exp(1)^8
## [1] 2980.958
\(e^8\) (≈ 2981) is less than n (4266), so we may proceed.
## First split data by DOI, then re-join other data
# Study-level (DOI) split so held-out accuracy reflects unseen studies.
set.seed(4)
nofilter_doi_split <- nofilter %>%
dplyr::select(doi) %>%
unique() %>%
initial_split(prop = 0.84)
#split just by doi
nofilter_doi_train <- training(nofilter_doi_split)
nofilter_doi_test <- testing(nofilter_doi_split)
# Re-join full rows per DOI, then drop doi so it is not a predictor.
nofilter_train <- left_join(nofilter_doi_train, nofilter, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
nofilter_test <- left_join(nofilter_doi_test, nofilter, by = "doi") %>%
dplyr::select(-doi) %>%
droplevels()
#inspect proportion in test and train
nrow(nofilter_test)
## [1] 838
nrow(nofilter_train)
## [1] 3428
# Create calibration and validation splits with tidymodels initial_split() function.
# set.seed(4)
# nofilter_split <- nofilter %>%
# initial_split(prop = 0.75, strata = polymer) # splits data into training and testing set.
# # default is 3/4ths split (but 75% training, 25% testing).
# # Stratification (strata) = grouping training/testing sets by region, state, etc.
# # Using the "strata" call ensures the number of data points in the training data is equivalent to the proportions in the original data set. (Strata below 10% of the total are pooled together.)
#
# # Create a training data set with the training() function
# # Pulls from training and testing sets created by initial_split()
# nofilter_train <- training(nofilter_split)
# nofilter_test <- testing(nofilter_split)
# # Examine the environment to be sure # of observations looks like the 75/25 split. 3199:1066.
# Random forest --
set.seed(2) # fixes the RNG so the forest fit below is reproducible
# Random forest classifying effect (Y/N) for the minimally filtered dataset.
nofilterrf <- randomForest(y = nofilter_train$effect_f, # dependent variable
                           x = nofilter_train %>%
                             dplyr::select(-effect_f), # all predictor variables
                           importance = TRUE, # permutation importance (not causal)
                           proximity = TRUE,
                           # BUG FIX: argument is `ntree`, not `ntrees`; the typo
                           # was absorbed by `...` and the 500-tree default grew
                           # (see the printed output below).
                           ntree = 100)
nofilterrf # examine the results.
##
## Call:
## randomForest(x = nofilter_train %>% dplyr::select(-effect_f), y = nofilter_train$effect_f, importance = T, proximity = T, ntrees = 100)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 21.62%
## Confusion matrix:
## N Y class.error
## N 2076 272 0.1158433
## Y 469 611 0.4342593
plot(nofilterrf)
# model performance appears to improve most at ~75 trees
varImpPlot(nofilterrf)
# displays which variables are most important
# helps to winnow down list of predictors
# recommended to weigh left pane more
# right pane also shows how evenly things split based on the list of predictors
# values close to 0 can be dropped, but don't have to be
Dose, organism group, exposure duration are important.
importance <- nofilterrf$importance
#View(importance)
# displays the data plotted in the plot above
#fix levels
# BUG FIX: `levels(x) <- ...` relabels factor codes positionally and can
# scramble categories when train/test level sets differ after droplevels();
# factor(x, levels = ...) preserves each observation's label.
nofilter_test$polymer <- factor(nofilter_test$polymer,
                                levels = levels(nofilter_train$polymer))
nofilter_test$shape <- factor(nofilter_test$shape,
                              levels = levels(nofilter_train$shape))
nofilter_test$lvl1_f <- factor(nofilter_test$lvl1_f,
                               levels = levels(nofilter_train$lvl1_f))
nofilter_test$organism.group <- factor(nofilter_test$organism.group,
                                       levels = levels(nofilter_train$organism.group))
# Held-out predictors only (response column removed before predicting).
x1 <- nofilter_test %>% dplyr::select(-effect_f)
# NOTE(review): `OOB` is not a predict.randomForest() argument; dropped.
fitted <- predict(nofilterrf,
                  x1,
                  type = "response")
misClasificError_nofilter <- mean(fitted != nofilter_test$effect_f)
accuracy_nofilter <- paste0('Accuracy: ', round(100*(1 - misClasificError_nofilter), 2), '%')
accuracy_nofilter
## [1] "Accuracy: 50.48%"
Prep for plotting.
###CHRONIC
# Held-out class probabilities for the chronic model, with predicted and
# observed labels attached for ROC plotting.
chronicpredictions <- predict(chronicrf,
                              chronic_test %>% dplyr::select(-effect_f),
                              type = "prob") %>%
  as.data.frame()
# Column of the larger probability gives the predicted class label
# (max.col with ties.method = "first" matches which.max row-wise).
chronicpredictions$predict <-
  names(chronicpredictions)[1:2][max.col(chronicpredictions[, 1:2], ties.method = "first")]
chronicpredictions$observed <- chronic_test$effect_f
####ACUTE
# Same construction for the acute model.
predictions <- predict(acuterf,
                       acute_test %>% dplyr::select(-effect_f),
                       type = "prob") %>%
  as.data.frame()
predictions$predict <-
  names(predictions)[1:2][max.col(predictions[, 1:2], ties.method = "first")]
predictions$observed <- acute_test$effect_f
head(predictions)
## N Y predict observed
## 1 0.880 0.120 N N
## 2 0.808 0.192 N N
## 3 0.812 0.188 N N
## 4 0.702 0.298 N Y
## 5 0.622 0.378 N Y
## 6 0.564 0.436 N N
#### all
# Held-out class probabilities and labels for the combined acute+chronic model.
allpredictions <- predict(allrf,
                          all_test %>% dplyr::select(-effect_f),
                          type = "prob") %>%
  as.data.frame()
# Larger of the N/Y probabilities determines the predicted label.
allpredictions$predict <-
  names(allpredictions)[1:2][max.col(allpredictions[, 1:2], ties.method = "first")]
allpredictions$observed <- all_test$effect_f
head(allpredictions)
## N Y predict observed
## 1 0.814 0.186 N Y
## 2 0.812 0.188 N Y
## 3 0.884 0.116 N N
## 4 0.876 0.124 N N
## 5 0.814 0.186 N Y
## 6 0.812 0.188 N N
###nofilter
# Held-out class probabilities and labels for the whole-dataset model.
nofilterpredictions <- as.data.frame(predict(nofilterrf, nofilter_test%>% dplyr::select(-effect_f), type = "prob"))
# predict class and then attach test class
nofilterpredictions$predict <- names(nofilterpredictions)[1:2][apply(nofilterpredictions[,1:2], 1, which.max)]
nofilterpredictions$observed <- nofilter_test$effect_f
###nofilterOptimized
# NOTE(review): myrf_optimized and multiVar_small_test are defined in another
# section of this document — confirm they exist before this chunk runs.
nofilter.optimizedpredictions <- as.data.frame(predict(myrf_optimized, multiVar_small_test %>% dplyr::select(-effect_f), type = "prob"))
# predict class and then attach test class
nofilter.optimizedpredictions$predict <- names(nofilter.optimizedpredictions)[1:2][apply(nofilter.optimizedpredictions[,1:2], 1, which.max)]
nofilter.optimizedpredictions$observed <- multiVar_small_test$effect_f
###nofilterOptimizedImputed
# NOTE(review): likewise, myrf_optimized_imputed and multiVar_small_test_imputed
# come from the multiple-imputation section elsewhere in the document.
nofilter.optimized.imputedpredictions <- as.data.frame(predict(myrf_optimized_imputed, multiVar_small_test_imputed %>% dplyr::select(-effect_f), type = "prob"))
# predict class and then attach test class
nofilter.optimized.imputedpredictions$predict <- names(nofilter.optimized.imputedpredictions)[1:2][apply(nofilter.optimized.imputedpredictions[,1:2], 1, which.max)]
nofilter.optimized.imputedpredictions$observed <- multiVar_small_test_imputed$effect_f
Plot.
require(ggdark)
# Build pROC roc objects from held-out predicted probabilities; column Y is
# the predicted probability of an effect. The ifelse() relabeling of observed
# is a no-op safeguard (observed is already a Y/N factor).
# 1 ROC curve, yes vs no for acute
roc.acute <- roc(ifelse(predictions$observed=="Y", "Y", "N"), as.numeric(predictions$Y))
#chronic
roc.chronic <- roc(ifelse(chronicpredictions$observed=="Y", "Y", "N"), as.numeric(chronicpredictions$Y))
#all
roc.all <- roc(ifelse(allpredictions$observed=="Y", "Y", "N"), as.numeric(allpredictions$Y))
#nofilter
roc.nofilter <- roc(ifelse(nofilterpredictions$observed=="Y", "Y", "N"), as.numeric(nofilterpredictions$Y))
#no filter (optimized)
roc.nofilter.optimized <- roc(ifelse(nofilter.optimizedpredictions$observed=="Y", "Y", "N"), as.numeric(nofilter.optimizedpredictions$Y))
#no filter (optimized; imputed)
roc.nofilter.optimized.imputed <- roc(ifelse(nofilter.optimized.imputedpredictions$observed=="Y", "Y", "N"), as.numeric(nofilter.optimized.imputedpredictions$Y))
##make ROC curves
# One ggroc panel per model, each subtitled with its held-out accuracy and n.
#all
allROC <- ggroc(roc.all, col = "yellow") +
labs(title = "Chronic and Acute",
subtitle = paste0(accuracy_all,', ',count_all)) +
dark_theme_bw()
#acute
acuteROC <- ggroc(roc.acute, col = "green") +
labs(title = "Acute",
subtitle = paste0(accuracy_acute,', ',count_acute)) +
dark_theme_bw()
#chronic
chronicROC <- ggroc(roc.chronic, col = "blue") +
labs(title = "Chronic",
subtitle = paste0(accuracy_chronic,', ',count_chronic)) + #auto label
dark_theme_bw()
#no filter
nofilterROC <- ggroc(roc.nofilter, col = "red3") +
labs(title = "Entire Dataset",
subtitle = paste0(accuracy_nofilter,', ',count_nofilter)) +
dark_theme_bw()
#optimized (rough fix)
# NOTE(review): accuracy_optimized / count_optimized (and the imputed
# variants below) are defined in another section — confirm they exist.
nofilteroptimizedROC <- ggroc(roc.nofilter.optimized, col = "orangered") +
labs(title = "Entire Dataset (optimized)",
subtitle = paste0(accuracy_optimized,', ',count_optimized)) +
dark_theme_bw()
#optimized (multiple imputation)
nofilteroptimizedimputedROC <- ggroc(roc.nofilter.optimized.imputed, col = "orange") +
labs(title = "Entire Dataset (optimized; imputed)",
subtitle = paste0(accuracy_optimized_imputed,', ',count_optimized_imputed)) +
dark_theme_bw()
#arrange together and print
require(gridExtra)
grid.arrange(nofilterROC, nofilteroptimizedROC, nofilteroptimizedimputedROC,allROC, acuteROC, chronicROC,
ncol = 3)
ROC curves may also be visualized together
library(pROC)     # library() errors loudly if a package is missing
library(tidyverse)
library(ggdark)
library(ggsci)
# Overlay all six ROC curves on one panel, distinguished by color + linetype.
ggroc(list(all = roc.nofilter, optimized = roc.nofilter.optimized, optimized.imputed = roc.nofilter.optimized.imputed, organisms = roc.all, acute = roc.acute, chronic = roc.chronic), aes = c("linetype", "color")) +
  labs(title = "ROC Curves for Aquatic Toxicity Random Forest",
       subtitle = "n = 4615",
       color = "Dataset",
       linetype = "Dataset") +
  scale_color_tron() +
  dark_theme_bw(base_size = 20) +
  # BUG FIX: the trailing `# +` orphaned the theme() call (it printed a bare
  # theme list instead of modifying the plot), and `plot.title.position` /
  # `plot.subtitle.position` are not element_text settings — centering titles
  # is done via plot.title / plot.subtitle elements.
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
## List of 2
## $ plot.title.position :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.subtitle.position:List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0.5
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
Make a classification tree.
## n= 1214
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1214 247 N (0.79654 0.20346)
## 2) dose.mg.L.master< 4.753 1190 229 N (0.80756 0.19244)
## 4) dose.mg.L.master< 0.09109 549 66 N (0.87978 0.12022) *
## 5) dose.mg.L.master>=0.09109 641 163 N (0.74571 0.25429)
## 10) size.length.um.used.for.conversions>=0.5784 399 79 N (0.80201 0.19799) *
## 11) size.length.um.used.for.conversions< 0.5784 242 84 N (0.65289 0.34711)
## 22) polymer=PS 195 60 N (0.69231 0.30769)
## 44) dose.mg.L.master>=1.651 23 1 N (0.95652 0.04348) *
## 45) dose.mg.L.master< 1.651 172 59 N (0.65698 0.34302)
## 90) dose.mg.L.master< 1.224 111 27 N (0.75676 0.24324) *
## 91) dose.mg.L.master>=1.224 61 29 Y (0.47541 0.52459)
## 182) effect.score< 0.5396 30 11 N (0.63333 0.36667) *
## 183) effect.score>=0.5396 31 10 Y (0.32258 0.67742) *
## 23) polymer=Not Reported,PE 47 23 Y (0.48936 0.51064) *
## 3) dose.mg.L.master>=4.753 24 6 Y (0.25000 0.75000) *
Plot an interpretable tree.
## cex 1 xlim c(-0.2, 1.2) ylim c(0, 1)
GINI importance measures the average gain in node purity from splits on a given variable. If the variable is useful, it tends to split mixed-label nodes into pure, single-class nodes, whereas splits on a permuted variable tend neither to increase nor decrease node purity. Permuting a useful variable therefore tends to give a relatively large decrease in mean Gini gain. GINI importance is closely related to the local decision function that random forest uses to select the best available split, so it takes little extra time to compute. On the other hand, mean Gini gain on local splits is not necessarily the most useful quantity to measure, in contrast to the change in overall model performance. Gini importance is generally inferior to (permutation-based) variable importance, as it is more biased, more unstable, and tends to answer a more indirect question.
#predict(myqrf, what=c(0.2, 0.3, 0.999)) # to print specific quantiles
Again appears to improve after ~100 trees.
#3-D plots
#remotes::install_github("tylermorganwall/rayshader")
#require(rayshader)
require(tidyverse)
require(ggsci)
require(ggdark)
#scatterplot with size, dose and polymer
# Acute data: effect score vs particle length and dose (numeric columns were
# log10-transformed upstream in the `acute` pipeline), faceted by shape.
acute3D <- acute %>%
#filter(effect_f == "Yes") %>%
ggplot(aes(x = size.length.um.used.for.conversions, y = dose.mg.L.master, color = effect.score)) + #, shape = polymer)) +
geom_point(alpha = 0.6) +
geom_smooth() +
scale_y_continuous("Dose (mg/L)", limits=c(-4,5)) +
scale_color_viridis_c(option = "A") +
ggtitle("Toxicity Probability by Length, Dose (mg/L) and Shape") +
labs(caption = "Acute Aquatic Organism") +
dark_theme_classic() +
facet_wrap(shape~.)
acute3D
#3D plot
#plot_gg(acute3D, multicore = TRUE, width = 5, height = 5, scale =250) #3D plot
#scatterplot with size, dose and effect for threshold-type endpoints
mass <- multiVar %>%
  # Drops rows whose effect_f label is the literal "NA" (real NAs are also
  # dropped because the comparison yields NA, which filter() removes).
  filter(!effect_f == "NA") %>%
  # BUG FIX: `effect.metric == c("HONEC","LOEC","NOEC")` recycles the 3-vector
  # against the column, keeping a row only when its metric happens to land on
  # the matching position; %in% keeps every HONEC/LOEC/NOEC row as intended.
  filter(effect.metric %in% c("HONEC", "LOEC", "NOEC")) %>%
  filter(!acute.chronic_f == "Unavailable") %>%
  filter(bio.org == "organism") %>%
  filter(!environment == "Terrestrial") %>%
  filter(lvl1_f == "Fitness") %>%
  ggplot(aes(x = size.length.um.used.for.conversions, y = dose.mg.L.master, color = effect_f)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 300)) +
  scale_x_log10("Length (um)", limits = c(1, 1e3)) +
  scale_colour_locuszoom() +
  ggtitle("Toxicity Probability by Length and Dose (mg/L)") +
  labs(caption = "Aquatic Organisms, HONEC/LOEC/NOEC") +
  dark_theme_classic()
mass
# Same threshold-endpoint scatter as `mass`, but dosed in particles/mL.
particles <- multiVar %>%
  filter(!effect_f == "NA") %>%
  # BUG FIX: `==` against a length-3 vector recycles and keeps only rows whose
  # metric aligns positionally; %in% is set membership, as intended.
  filter(effect.metric %in% c("HONEC", "LOEC", "NOEC")) %>%
  filter(!acute.chronic_f == "Unavailable") %>%
  filter(bio.org == "organism") %>%
  filter(!environment == "Terrestrial") %>%
  filter(lvl1_f == "Fitness") %>%
  ggplot(aes(x = size.length.um.used.for.conversions, y = dose.particles.mL.master, color = effect_f)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  scale_y_log10("Dose (particles/mL)", limits = c(1e-4, 1e7)) +
  scale_x_log10("Length (um)", limits = c(1, 1e4)) +
  scale_colour_locuszoom() +
  ggtitle("Toxicity Probability by Length and Dose (particles/mL)") +
  labs(caption = "Aquatic Organisms, HONEC/LOEC/NOEC") +
  dark_theme_classic()
particles
# Same threshold-endpoint scatter, dosed in volume (um^3/mL); note this one
# draws from aoc_z rather than multiVar, as in the original.
volume <- aoc_z %>%
  filter(!effect_f == "NA") %>%
  # BUG FIX: `==` against a length-3 vector recycles and keeps only rows whose
  # metric aligns positionally; %in% is set membership, as intended.
  filter(effect.metric %in% c("HONEC", "LOEC", "NOEC")) %>%
  filter(!acute.chronic_f == "Unavailable") %>%
  filter(bio.org == "organism") %>%
  filter(!environment == "Terrestrial") %>%
  filter(lvl1_f == "Fitness") %>%
  ggplot(aes(x = size.length.um.used.for.conversions, y = dose.um3.mL.master, color = effect_f)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  scale_y_log10("Dose (um3/mL)", limits = c(1e+1, 1e7)) +
  scale_x_log10("Length (um3)", limits = c(1e-1, 100)) +
  scale_colour_locuszoom() +
  ggtitle("Toxicity Probability by Length and Dose (um3/mL)") +
  labs(caption = "Aquatic Organisms, HONEC/LOEC/NOEC") +
  dark_theme_classic()
volume
require(gridExtra)
grid.arrange(particles, mass, volume, nrow = 3)
#scatterplot with size, dose and polymer
# Fitness-endpoint scatter of assessment-factor-adjusted dose vs particle
# length, faceted by organism group.
taxa_mass <-aoc_z %>%
filter(!effect_f == "NA") %>%
# filter(effect.metric == c("HONEC", "LOEC", "NOEC")) %>%
filter(!acute.chronic_f == "Unavailable") %>%
filter(bio.org == "organism") %>%
filter(!environment == "Terrestrial") %>%
filter(lvl1_f == "Fitness") %>%
ggplot(aes(x = size.length.um.used.for.conversions, y = dose.mg.L.master.AF.noec, color = effect_f)) +
geom_point(alpha = 0.5) +
geom_smooth() +
scale_y_continuous(limits = c(0, 300))+
# scale_y_log10("Dose (mg/L)",limits = c(1e-4, 1e7))+
scale_x_log10("Length (um)", limits = c(1, 1e3)) +
#coord_trans(x = "log10") +
#scale_x_continuous("Length (um)", breaks = scales::trans_breaks("log10", function(x) 10^x, n = 5),
# labels = trans_format("log10", scales::math_format(10^.x))) +
scale_colour_locuszoom() +
ggtitle("Toxicity Probability by Length and Dose (mg/L)") +
# NOTE(review): the caption still says HONEC/LOEC/NOEC, but the metric filter
# above is commented out — update the caption or restore the filter.
labs(caption = "Aquatic Organisms, HONEC/LOEC/NOEC") +
dark_theme_classic() +
facet_wrap(organism.group ~.)
taxa_mass
# Logistic Regression for acute fitness
# Modeling frame: organism-level fitness endpoints with a reported size class;
# adds a log10 mass-dose column for the simpler single-predictor models below.
m1_crust <- aoc_z %>%
  filter(
    effect_f != "NA",
    acute.chronic_f != "Unavailable",
    bio.org == "organism",
    environment != "Terrestrial",
    lvl1_f == "Fitness",
    size_f != "Not Reported"
  ) %>%
  mutate(logdose.mg.L.master = log10(dose.mg.L.master))
# Previously explored restrictions, kept for reference:
#   filter(effect.metric %in% c("HONEC", "LOEC", "NOEC"))
#   filter(organism.group == "Crustacea")
#   filter(acute.chronic_f == "Acute")
# Full logistic model for the binary effect outcome: main effects of particle
# length, log10 mass dose, log10 particle-count dose, and taxon, plus every
# two-way interaction (the `^2` on the parenthesized term group expands to
# all pairwise interactions).
m1_crust_model <- glm(
  effect_10 ~ (size.length.um.used.for.conversions +
    log10(dose.mg.L.master) +
    log10(dose.particles.mL.master) +
    organism.group)^2,
  data = m1_crust,
  family = "binomial",
  na.action = "na.exclude"
)
summary(m1_crust_model)
##
## Call:
## glm(formula = effect_10 ~ (size.length.um.used.for.conversions +
## log10(dose.mg.L.master) + log10(dose.particles.mL.master) +
## organism.group)^2, family = "binomial", data = m1_crust,
## na.action = "na.exclude")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2294 -0.7369 -0.6168 -0.1817 2.8654
##
## Coefficients:
## Estimate
## (Intercept) -9.780e-01
## size.length.um.used.for.conversions -1.785e-03
## log10(dose.mg.L.master) 2.711e-02
## log10(dose.particles.mL.master) -4.491e-02
## organism.groupFish 5.462e-01
## organism.groupMollusca -4.519e+00
## size.length.um.used.for.conversions:log10(dose.mg.L.master) -9.672e-05
## size.length.um.used.for.conversions:log10(dose.particles.mL.master) 2.284e-04
## size.length.um.used.for.conversions:organism.groupFish -2.064e-02
## size.length.um.used.for.conversions:organism.groupMollusca 3.543e-03
## log10(dose.mg.L.master):log10(dose.particles.mL.master) 4.640e-02
## log10(dose.mg.L.master):organism.groupFish 4.597e-01
## log10(dose.mg.L.master):organism.groupMollusca -4.120e-01
## log10(dose.particles.mL.master):organism.groupFish -3.123e-01
## log10(dose.particles.mL.master):organism.groupMollusca 4.461e-01
## Std. Error
## (Intercept) 2.058e-01
## size.length.um.used.for.conversions 1.346e-03
## log10(dose.mg.L.master) 1.003e-01
## log10(dose.particles.mL.master) 3.560e-02
## organism.groupFish 9.960e-01
## organism.groupMollusca 9.400e-01
## size.length.um.used.for.conversions:log10(dose.mg.L.master) 4.184e-04
## size.length.um.used.for.conversions:log10(dose.particles.mL.master) 4.093e-04
## size.length.um.used.for.conversions:organism.groupFish 1.259e-02
## size.length.um.used.for.conversions:organism.groupMollusca 1.997e-03
## log10(dose.mg.L.master):log10(dose.particles.mL.master) 1.959e-02
## log10(dose.mg.L.master):organism.groupFish 3.044e-01
## log10(dose.mg.L.master):organism.groupMollusca 2.247e-01
## log10(dose.particles.mL.master):organism.groupFish 2.055e-01
## log10(dose.particles.mL.master):organism.groupMollusca 1.068e-01
## z value
## (Intercept) -4.752
## size.length.um.used.for.conversions -1.326
## log10(dose.mg.L.master) 0.270
## log10(dose.particles.mL.master) -1.262
## organism.groupFish 0.548
## organism.groupMollusca -4.808
## size.length.um.used.for.conversions:log10(dose.mg.L.master) -0.231
## size.length.um.used.for.conversions:log10(dose.particles.mL.master) 0.558
## size.length.um.used.for.conversions:organism.groupFish -1.639
## size.length.um.used.for.conversions:organism.groupMollusca 1.774
## log10(dose.mg.L.master):log10(dose.particles.mL.master) 2.369
## log10(dose.mg.L.master):organism.groupFish 1.510
## log10(dose.mg.L.master):organism.groupMollusca -1.834
## log10(dose.particles.mL.master):organism.groupFish -1.520
## log10(dose.particles.mL.master):organism.groupMollusca 4.176
## Pr(>|z|)
## (Intercept) 2.01e-06
## size.length.um.used.for.conversions 0.1847
## log10(dose.mg.L.master) 0.7869
## log10(dose.particles.mL.master) 0.2071
## organism.groupFish 0.5834
## organism.groupMollusca 1.53e-06
## size.length.um.used.for.conversions:log10(dose.mg.L.master) 0.8172
## size.length.um.used.for.conversions:log10(dose.particles.mL.master) 0.5767
## size.length.um.used.for.conversions:organism.groupFish 0.1013
## size.length.um.used.for.conversions:organism.groupMollusca 0.0760
## log10(dose.mg.L.master):log10(dose.particles.mL.master) 0.0179
## log10(dose.mg.L.master):organism.groupFish 0.1309
## log10(dose.mg.L.master):organism.groupMollusca 0.0667
## log10(dose.particles.mL.master):organism.groupFish 0.1286
## log10(dose.particles.mL.master):organism.groupMollusca 2.96e-05
##
## (Intercept) ***
## size.length.um.used.for.conversions
## log10(dose.mg.L.master)
## log10(dose.particles.mL.master)
## organism.groupFish
## organism.groupMollusca ***
## size.length.um.used.for.conversions:log10(dose.mg.L.master)
## size.length.um.used.for.conversions:log10(dose.particles.mL.master)
## size.length.um.used.for.conversions:organism.groupFish
## size.length.um.used.for.conversions:organism.groupMollusca .
## log10(dose.mg.L.master):log10(dose.particles.mL.master) *
## log10(dose.mg.L.master):organism.groupFish
## log10(dose.mg.L.master):organism.groupMollusca .
## log10(dose.particles.mL.master):organism.groupFish
## log10(dose.particles.mL.master):organism.groupMollusca ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1311.5 on 1310 degrees of freedom
## Residual deviance: 1190.7 on 1296 degrees of freedom
## (293 observations deleted due to missingness)
## AIC: 1220.7
##
## Number of Fisher Scoring iterations: 9
m1_crust_model$coef
## (Intercept)
## -9.780446e-01
## size.length.um.used.for.conversions
## -1.785247e-03
## log10(dose.mg.L.master)
## 2.710767e-02
## log10(dose.particles.mL.master)
## -4.490859e-02
## organism.groupFish
## 5.461589e-01
## organism.groupMollusca
## -4.519370e+00
## size.length.um.used.for.conversions:log10(dose.mg.L.master)
## -9.671817e-05
## size.length.um.used.for.conversions:log10(dose.particles.mL.master)
## 2.284465e-04
## size.length.um.used.for.conversions:organism.groupFish
## -2.063551e-02
## size.length.um.used.for.conversions:organism.groupMollusca
## 3.542676e-03
## log10(dose.mg.L.master):log10(dose.particles.mL.master)
## 4.640464e-02
## log10(dose.mg.L.master):organism.groupFish
## 4.597273e-01
## log10(dose.mg.L.master):organism.groupMollusca
## -4.120115e-01
## log10(dose.particles.mL.master):organism.groupFish
## -3.122596e-01
## log10(dose.particles.mL.master):organism.groupMollusca
## 4.460948e-01
# Reduced modeling frame: outcome, log10 mass dose, and particle length, with
# incomplete rows removed so the simple model sees no missing values.
m1_crust_simple <- m1_crust %>%
  dplyr::select(effect_10, logdose.mg.L.master, size.length.um.used.for.conversions) %>%
  drop_na()
# Simple two-predictor logistic model: `*` fits both main effects and their
# interaction.
m1_crust_model_dose <- glm(
  effect_10 ~ logdose.mg.L.master * size.length.um.used.for.conversions,
  data = m1_crust_simple,
  family = "binomial",
  na.action = "na.exclude"
)
summary(m1_crust_model_dose)
##
## Call:
## glm(formula = effect_10 ~ logdose.mg.L.master * size.length.um.used.for.conversions,
## family = "binomial", data = m1_crust_simple, na.action = "na.exclude")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8635 -0.7140 -0.6165 -0.5017 2.0663
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -1.418e+00 7.292e-02
## logdose.mg.L.master 1.970e-01 4.744e-02
## size.length.um.used.for.conversions -3.797e-04 1.787e-04
## logdose.mg.L.master:size.length.um.used.for.conversions 1.016e-04 4.554e-05
## z value Pr(>|z|)
## (Intercept) -19.447 < 2e-16 ***
## logdose.mg.L.master 4.152 3.29e-05 ***
## size.length.um.used.for.conversions -2.125 0.0336 *
## logdose.mg.L.master:size.length.um.used.for.conversions 2.230 0.0257 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1358.5 on 1347 degrees of freedom
## Residual deviance: 1316.5 on 1344 degrees of freedom
## AIC: 1324.5
##
## Number of Fisher Scoring iterations: 4
#alternative plot with ggpredict
#devtools::install_github("cardiomoon/ggiraphExtra")
# library() fails fast if ggiraphExtra is absent; require() would merely
# return FALSE and let ggPredict() fail with a confusing error.
library(ggiraphExtra)
# Interactive predicted-probability surface for the dose x length model.
ggPredict(m1_crust_model_dose, interactive = TRUE, colorn = 100, jitter = FALSE)
# What happens if we log-transform size?
# What about volume?
# Same simple logistic model, but with dose expressed as particle volume
# (um^3/mL); incomplete rows dropped before fitting.
m1_crust_simple_volume <- m1_crust %>%
  mutate(logdose.um3.mL.master = log10(dose.um3.mL.master)) %>%
  dplyr::select(effect_10, logdose.um3.mL.master, size.length.um.used.for.conversions) %>%
  drop_na()
# Two predictors plus their interaction (`*` expands to main effects +
# interaction term).
m1_crust_model_volume <- glm(
  effect_10 ~ logdose.um3.mL.master * size.length.um.used.for.conversions,
  data = m1_crust_simple_volume,
  family = "binomial",
  na.action = "na.exclude"
)
summary(m1_crust_model_volume)
##
## Call:
## glm(formula = effect_10 ~ logdose.um3.mL.master * size.length.um.used.for.conversions,
## family = "binomial", data = m1_crust_simple_volume, na.action = "na.exclude")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8116 -0.6866 -0.6282 -0.5292 2.0869
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -2.176e+00 2.574e-01
## logdose.um3.mL.master 1.262e-01 4.049e-02
## size.length.um.used.for.conversions -1.176e-03 4.464e-04
## logdose.um3.mL.master:size.length.um.used.for.conversions 1.259e-04 4.533e-05
## z value Pr(>|z|)
## (Intercept) -8.451 < 2e-16 ***
## logdose.um3.mL.master 3.118 0.00182 **
## size.length.um.used.for.conversions -2.635 0.00842 **
## logdose.um3.mL.master:size.length.um.used.for.conversions 2.777 0.00549 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1311.5 on 1310 degrees of freedom
## Residual deviance: 1277.6 on 1307 degrees of freedom
## AIC: 1285.6
##
## Number of Fisher Scoring iterations: 4
#plot logistic volume
# library() errors immediately when ggiraphExtra is missing, unlike
# require(), which returns FALSE and defers the failure to ggPredict().
library(ggiraphExtra)
ggPredict(m1_crust_model_volume, interactive = TRUE, colorn = 100, jitter = FALSE)
# scatterplot with size, dose and polymer
# Crustacea only: binary effect outcome vs. mass dose (mg/L, NOEC assessment
# factor), faceted by size class. droplevels() discards factor levels emptied
# by the filters so facets and legends show only observed categories.
mass_probability_crustacea <- aoc_z %>%
  filter(
    effect_f != "NA",
    organism.group == "Crustacea",
    acute.chronic_f != "Unavailable",
    bio.org == "organism",
    environment != "Terrestrial",
    lvl1_f == "Fitness",
    size_f != "Not Reported"
  ) %>%
  droplevels() %>%
  ggplot(aes(x = dose.mg.L.master.AF.noec, y = effect_10, color = effect_f)) +
  geom_point(alpha = 0.5) +
  geom_smooth() +
  scale_y_continuous(limits = c(0, 1)) +
  scale_x_log10("Dose (mg/L)", limits = c(1e-4, 1e7)) +
  scale_colour_locuszoom() +
  ggtitle("Toxicity Probability by Length and Dose (mg/L)") +
  labs(caption = "Aquatic Organisms, HONEC/LOEC/NOEC") +
  dark_theme_classic() +
  facet_wrap(size_f ~.)
mass_probability_crustacea